#Loading required Libraries
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>")) # Increase cell width
display(HTML("<style>.rendered_html { font-size: 16px; }</style>")) # Increase font size
import warnings
warnings.filterwarnings('ignore')
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import statsmodels.api as sm
import sklearn
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn import tree
from dtreeviz.trees import dtreeviz
#Loading required Datasets
pm_df = pd.read_csv("power_market.csv")
pm_df.head(5)
| | fc_demand | fc_nuclear | import_FR | export_FR | fc_wind | fc_solar_pv | fc_solar_th | price | date | hour |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 24400.0 | 7117.2 | 3000.0 | 2600.0 | 1732.0 | 0.0 | 5.1 | 58.82 | 2017-01-01 | 0 |
| 1 | 23616.0 | 7117.2 | 3000.0 | 2650.0 | 1826.0 | 0.0 | 0.6 | 58.23 | 2017-01-01 | 1 |
| 2 | 21893.0 | 7117.2 | 3000.0 | 2650.0 | 1823.0 | 0.0 | 4.6 | 51.95 | 2017-01-01 | 2 |
| 3 | 20693.0 | 7117.2 | 3000.0 | 2650.0 | 1777.0 | 0.0 | 9.7 | 47.27 | 2017-01-01 | 3 |
| 4 | 19599.0 | 7117.2 | 3000.0 | 2650.0 | 1746.0 | 0.0 | 24.1 | 45.49 | 2017-01-01 | 4 |
#Getting a quick preview of the data:
pm_df.describe(include = 'all')
| | fc_demand | fc_nuclear | import_FR | export_FR | fc_wind | fc_solar_pv | fc_solar_th | price | date | hour |
|---|---|---|---|---|---|---|---|---|---|---|
| count | 32135.000000 | 32135.000000 | 32122.000000 | 32122.000000 | 32135.000000 | 32135.000000 | 32135.000000 | 32135.000000 | 32135 | 32135.000000 |
| unique | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1339 | NaN |
| top | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2018-10-28 | NaN |
| freq | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 25 | NaN |
| mean | 28475.593527 | 6421.333431 | 2445.285163 | 2217.864703 | 5627.214688 | 1100.253546 | 601.186600 | 48.440119 | NaN | 11.500296 |
| std | 4686.675600 | 862.355391 | 623.215136 | 550.563300 | 3266.961919 | 1512.531532 | 681.584906 | 14.842233 | NaN | 6.922199 |
| min | 16372.000000 | 3672.800000 | 700.000000 | 200.000000 | 139.000000 | 0.000000 | 0.000000 | 0.030000 | NaN | 0.000000 |
| 25% | 24605.500000 | 6071.900000 | 2100.000000 | 1900.000000 | 3083.000000 | 0.000000 | 31.600000 | 39.770000 | NaN | 6.000000 |
| 50% | 28446.000000 | 7117.200000 | 2450.000000 | 2200.000000 | 4989.000000 | 107.800000 | 331.400000 | 49.880000 | NaN | 12.000000 |
| 75% | 32124.000000 | 7117.200000 | 2900.000000 | 2600.000000 | 7610.500000 | 2099.350000 | 957.150000 | 58.250000 | NaN | 17.500000 |
| max | 41103.000000 | 7117.200000 | 3700.000000 | 3700.000000 | 17232.000000 | 7211.200000 | 2253.700000 | 101.990000 | NaN | 23.000000 |
#Making use of the pandas profiling library to have a very comprehensive view of our dataset:
#from pandas_profiling import ProfileReport
#report = ProfileReport(pm_df, minimal=False)
#report
From the Pandas profiling, we can see that we have 10 variables, 9 of them numeric and 1 categorical. The total number of observations is 32135, with 26 values missing.
The interactions and Pearson's correlation plots from the profiling report are not reproduced here. The Phik correlation showed similar results to the Pearson's correlation, with some exceptions for solar energy.
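A minimal sketch to reproduce both correlation matrices directly (assuming the optional phik package is installed; this cell is illustrative and was not part of the original run):
#Pearson correlation of the numeric columns
sns.heatmap(pm_df.corr(method='pearson'), annot=True, fmt='.2f', cmap='coolwarm')
plt.show()
#Phik correlation (also captures non-linear and categorical relationships)
import phik  #registers the .phik_matrix() accessor on DataFrames
sns.heatmap(pm_df.phik_matrix(), annot=True, fmt='.2f', cmap='coolwarm')
plt.show()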
pm_df.isnull().sum()
fc_demand       0
fc_nuclear      0
import_FR      13
export_FR      13
fc_wind         0
fc_solar_pv     0
fc_solar_th     0
price           0
date            0
hour            0
dtype: int64
#Filling with the last observed value (rationale: small number of missing values, and demand should remain fairly stable from one day to the next)
pm_df.fillna(method='ffill', inplace=True)
pm_df.isnull().sum()
fc_demand      0
fc_nuclear     0
import_FR      0
export_FR      0
fc_wind        0
fc_solar_pv    0
fc_solar_th    0
price          0
date           0
hour           0
dtype: int64
pm_df_no_outliers = pm_df.copy()
def remove_outlier(df_in, col_name):
    q1 = df_in[col_name].quantile(0.25)
    q3 = df_in[col_name].quantile(0.75)
    iqr = q3 - q1 #Interquartile range
    fence_low = q1 - 1.5*iqr
    fence_high = q3 + 1.5*iqr
    df_out = df_in.loc[(df_in[col_name] > fence_low) & (df_in[col_name] < fence_high)]
    print("{} outliers removed".format(len(df_in) - len(df_out)))
    return df_out
for columnName in pm_df.columns:
    if pm_df_no_outliers[columnName].dtype in ['int64','float64']:
        print("Analyzing outliers of column: {}".format(columnName))
        plt.figure(figsize=(10,10))
        pm_df_no_outliers.boxplot([columnName], grid=False, fontsize=15)
        pm_df_no_outliers = remove_outlier(pm_df_no_outliers, columnName)
        plt.show()
Analyzing outliers of column: fc_demand 0 outliers removed
Analyzing outliers of column: fc_nuclear 728 outliers removed
Analyzing outliers of column: import_FR 294 outliers removed
Analyzing outliers of column: export_FR 455 outliers removed
Analyzing outliers of column: fc_wind 342 outliers removed
Analyzing outliers of column: fc_solar_pv 763 outliers removed
Analyzing outliers of column: fc_solar_th 890 outliers removed
Analyzing outliers of column: price 591 outliers removed
Analyzing outliers of column: hour 0 outliers removed
len(pm_df_no_outliers)
28072
pm_df_clipped = pm_df.copy()
def clip_outliers(df_in, col_name):
    q1 = df_in[col_name].quantile(0.25)
    q3 = df_in[col_name].quantile(0.75)
    iqr = q3 - q1 #Interquartile range
    fence_low = q1 - 1.5*iqr
    fence_high = q3 + 1.5*iqr
    n_outliers = len(df_in[df_in[col_name] < fence_low]) + len(df_in[df_in[col_name] > fence_high])
    print("{} outliers clipped".format(n_outliers))
    #.loc (not .at, which is strictly scalar access) is the correct accessor for boolean-mask assignment
    df_in.loc[df_in[col_name] < fence_low, col_name] = fence_low
    df_in.loc[df_in[col_name] > fence_high, col_name] = fence_high
    return df_in
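Side note: pandas' built-in clip() performs the same two boundary assignments in a single call; a minimal sketch on a toy Series (illustrative only, the function above is what we actually use):
toy = pd.Series([1.0, 50.0, 100.0, 500.0])
print(toy.clip(lower=10.0, upper=100.0).tolist()) #[10.0, 50.0, 100.0, 100.0]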
for columnName in pm_df_clipped.columns:
    if pm_df_clipped[columnName].dtype in ['int64','float64']:
        print("Analyzing outliers of column: {}".format(columnName))
        plt.figure(figsize=(10,10))
        pm_df_clipped.boxplot([columnName], grid=False, fontsize=15)
        pm_df_clipped = clip_outliers(pm_df_clipped, columnName)
        plt.show()
Analyzing outliers of column: fc_demand 0 outliers clipped
Analyzing outliers of column: fc_nuclear 728 outliers clipped
Analyzing outliers of column: import_FR 262 outliers clipped
Analyzing outliers of column: export_FR 539 outliers clipped
Analyzing outliers of column: fc_wind 347 outliers clipped
Analyzing outliers of column: fc_solar_pv 842 outliers clipped
Analyzing outliers of column: fc_solar_th 0 outliers clipped
Analyzing outliers of column: price 690 outliers clipped
Analyzing outliers of column: hour 0 outliers clipped
len(pm_df_clipped)
32135
#Choosing between outlier treatments (we tested both and obtained better results for most models by clipping outlier values).
#Furthermore, clipping outliers preserves more data and the time structure, and lets us make use of lagged variables in a more straightforward fashion.
#pm_df = pm_df_no_outliers
pm_df = pm_df_clipped
#Preliminary feature engineering:
#Combining solar energies:
pm_df['fc_solar'] = pm_df['fc_solar_th'] + pm_df['fc_solar_pv']
#Creating the aforementioned Thermal Gap:
pm_df['Thermal_Gap'] = pm_df['fc_demand'] - pm_df['fc_nuclear'] - pm_df['fc_wind'] - pm_df['fc_solar']
#Creating Energy Trade Balance SPAIN/FR
pm_df['BoT_FR'] = pm_df['export_FR'] - pm_df['import_FR']
pm_df.head(5)
| | fc_demand | fc_nuclear | import_FR | export_FR | fc_wind | fc_solar_pv | fc_solar_th | price | date | hour | fc_solar | Thermal_Gap | BoT_FR |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 24400.0 | 7117.2 | 3000.0 | 2600.0 | 1732.0 | 0.0 | 5.1 | 58.82 | 2017-01-01 | 0.0 | 5.1 | 15545.7 | -400.0 |
| 1 | 23616.0 | 7117.2 | 3000.0 | 2650.0 | 1826.0 | 0.0 | 0.6 | 58.23 | 2017-01-01 | 1.0 | 0.6 | 14672.2 | -350.0 |
| 2 | 21893.0 | 7117.2 | 3000.0 | 2650.0 | 1823.0 | 0.0 | 4.6 | 51.95 | 2017-01-01 | 2.0 | 4.6 | 12948.2 | -350.0 |
| 3 | 20693.0 | 7117.2 | 3000.0 | 2650.0 | 1777.0 | 0.0 | 9.7 | 47.27 | 2017-01-01 | 3.0 | 9.7 | 11789.1 | -350.0 |
| 4 | 19599.0 | 7117.2 | 3000.0 | 2650.0 | 1746.0 | 0.0 | 24.1 | 45.49 | 2017-01-01 | 4.0 | 24.1 | 10711.7 | -350.0 |
pm_df = pm_df.drop(columns = ['import_FR','export_FR','fc_solar_pv','fc_solar_th'], axis = 1)
pm_df.head(5)
| | fc_demand | fc_nuclear | fc_wind | price | date | hour | fc_solar | Thermal_Gap | BoT_FR |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 24400.0 | 7117.2 | 1732.0 | 58.82 | 2017-01-01 | 0.0 | 5.1 | 15545.7 | -400.0 |
| 1 | 23616.0 | 7117.2 | 1826.0 | 58.23 | 2017-01-01 | 1.0 | 0.6 | 14672.2 | -350.0 |
| 2 | 21893.0 | 7117.2 | 1823.0 | 51.95 | 2017-01-01 | 2.0 | 4.6 | 12948.2 | -350.0 |
| 3 | 20693.0 | 7117.2 | 1777.0 | 47.27 | 2017-01-01 | 3.0 | 9.7 | 11789.1 | -350.0 |
| 4 | 19599.0 | 7117.2 | 1746.0 | 45.49 | 2017-01-01 | 4.0 | 24.1 | 10711.7 | -350.0 |
#Converting dataframe series "date" into datetime to use dt. functionalities:
pm_df['date'] = pd.to_datetime(pm_df['date'])
# Adding additional columns with year & day of the year:
pm_df['Year'] = pm_df['date'].dt.year
pm_df['Day_of_Year'] = pm_df['date'].dt.dayofyear
pm_df.head(5)
| | fc_demand | fc_nuclear | fc_wind | price | date | hour | fc_solar | Thermal_Gap | BoT_FR | Year | Day_of_Year |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 24400.0 | 7117.2 | 1732.0 | 58.82 | 2017-01-01 | 0.0 | 5.1 | 15545.7 | -400.0 | 2017 | 1 |
| 1 | 23616.0 | 7117.2 | 1826.0 | 58.23 | 2017-01-01 | 1.0 | 0.6 | 14672.2 | -350.0 | 2017 | 1 |
| 2 | 21893.0 | 7117.2 | 1823.0 | 51.95 | 2017-01-01 | 2.0 | 4.6 | 12948.2 | -350.0 | 2017 | 1 |
| 3 | 20693.0 | 7117.2 | 1777.0 | 47.27 | 2017-01-01 | 3.0 | 9.7 | 11789.1 | -350.0 | 2017 | 1 |
| 4 | 19599.0 | 7117.2 | 1746.0 | 45.49 | 2017-01-01 | 4.0 | 24.1 | 10711.7 | -350.0 | 2017 | 1 |
pm_df['weekend'] = pm_df['date'].dt.dayofweek #temporarily storing the raw day of week (0=Monday ... 6=Sunday)
pm_df.head(5)
| | fc_demand | fc_nuclear | fc_wind | price | date | hour | fc_solar | Thermal_Gap | BoT_FR | Year | Day_of_Year | weekend |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 24400.0 | 7117.2 | 1732.0 | 58.82 | 2017-01-01 | 0.0 | 5.1 | 15545.7 | -400.0 | 2017 | 1 | 6 |
| 1 | 23616.0 | 7117.2 | 1826.0 | 58.23 | 2017-01-01 | 1.0 | 0.6 | 14672.2 | -350.0 | 2017 | 1 | 6 |
| 2 | 21893.0 | 7117.2 | 1823.0 | 51.95 | 2017-01-01 | 2.0 | 4.6 | 12948.2 | -350.0 | 2017 | 1 | 6 |
| 3 | 20693.0 | 7117.2 | 1777.0 | 47.27 | 2017-01-01 | 3.0 | 9.7 | 11789.1 | -350.0 | 2017 | 1 | 6 |
| 4 | 19599.0 | 7117.2 | 1746.0 | 45.49 | 2017-01-01 | 4.0 | 24.1 | 10711.7 | -350.0 | 2017 | 1 | 6 |
#Defining a function that maps day-of-week values 5 and 6 (Saturday and Sunday) to 1 for weekend, and 0 otherwise
def weekend(x):
    if x in [5, 6]:
        return 1
    else:
        return 0
#Note: the line below calls .replace without assigning the result, so it only displays a copy of the raw day-of-week values; the actual recoding is applied further down with .apply
pm_df['weekend'].replace({1:0})
0 6
1 6
2 6
3 6
4 6
..
32130 0
32131 0
32132 0
32133 0
32134 0
Name: weekend, Length: 32135, dtype: int64
#Applying the weekend function to the day-of-week values stored in the 'weekend' column
pm_df['weekend'] = pm_df['weekend'].apply(weekend)
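The same recoding can also be written as a vectorized one-liner, equivalent to the helper function above (a sketch; it recomputes the column directly from the date):
pm_df['weekend'] = pm_df['date'].dt.dayofweek.isin([5, 6]).astype(int)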
pm_df.head(5)
| | fc_demand | fc_nuclear | fc_wind | price | date | hour | fc_solar | Thermal_Gap | BoT_FR | Year | Day_of_Year | weekend |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 24400.0 | 7117.2 | 1732.0 | 58.82 | 2017-01-01 | 0.0 | 5.1 | 15545.7 | -400.0 | 2017 | 1 | 1 |
| 1 | 23616.0 | 7117.2 | 1826.0 | 58.23 | 2017-01-01 | 1.0 | 0.6 | 14672.2 | -350.0 | 2017 | 1 | 1 |
| 2 | 21893.0 | 7117.2 | 1823.0 | 51.95 | 2017-01-01 | 2.0 | 4.6 | 12948.2 | -350.0 | 2017 | 1 | 1 |
| 3 | 20693.0 | 7117.2 | 1777.0 | 47.27 | 2017-01-01 | 3.0 | 9.7 | 11789.1 | -350.0 | 2017 | 1 | 1 |
| 4 | 19599.0 | 7117.2 | 1746.0 | 45.49 | 2017-01-01 | 4.0 | 24.1 | 10711.7 | -350.0 | 2017 | 1 | 1 |
# Creating a new column to determine whether the day is a public holiday
# First checking the min and max dates to understand the exact range of dates we have to look at:
print(pm_df.date.min())
print(pm_df.date.max())
# So now, tedious work time! Time to identify public holidays in Spain from Jan 2017 to Aug 2020
# Sources:
# 2017: https://calendarios.ideal.es/2017
# 2018: https://calendarios.ideal.es/2018
# 2019: https://calendarios.ideal.es/2019
# 2020: https://calendarios.ideal.es/2020
festivos = ['2017-01-06','2017-03-13','2017-03-02','2017-03-14','2017-05-01','2017-08-15','2017-10-12','2017-11-01','2017-12-06','2017-12-08','2017-12-25','2018-01-01','2018-01-06','2018-03-30','2018-04-02','2018-05-01','2018-08-15','2018-10-12','2018-11-01','2018-12-06','2018-12-08','2018-12-25','2019-01-01','2019-04-02','2019-04-19','2019-05-01','2019-08-15','2019-10-12','2019-11-01','2019-12-06','2019-12-25','2020-01-01','2020-01-06','2020-04-02','2020-04-10','2020-05-01','2020-08-15']
festivosdate = pd.to_datetime(festivos)
print(festivosdate)
2017-01-01 00:00:00
2020-08-31 00:00:00
DatetimeIndex(['2017-01-06', '2017-03-13', '2017-03-02', '2017-03-14',
'2017-05-01', '2017-08-15', '2017-10-12', '2017-11-01',
'2017-12-06', '2017-12-08', '2017-12-25', '2018-01-01',
'2018-01-06', '2018-03-30', '2018-04-02', '2018-05-01',
'2018-08-15', '2018-10-12', '2018-11-01', '2018-12-06',
'2018-12-08', '2018-12-25', '2019-01-01', '2019-04-02',
'2019-04-19', '2019-05-01', '2019-08-15', '2019-10-12',
'2019-11-01', '2019-12-06', '2019-12-25', '2020-01-01',
'2020-01-06', '2020-04-02', '2020-04-10', '2020-05-01',
'2020-08-15'],
dtype='datetime64[ns]', freq=None)
pm_df['holiday'] = pm_df['date'].isin(festivosdate)
pm_df.holiday.unique()
array([False, True])
# That, however, returns True vs False, which is harder to use in a model, so we are recoding them to 1 and 0:
def holiday(x):
    if x == True:
        return 1
    else:
        return 0
pm_df['holiday'] = pm_df['holiday'].apply(holiday)
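As with the weekend flag, this recoding can be collapsed into a single vectorized expression (a sketch, equivalent to the apply above):
pm_df['holiday'] = pm_df['date'].isin(festivosdate).astype(int)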
# Checking if the function was implemented correctly
pm_df.sort_values('holiday')
| | fc_demand | fc_nuclear | fc_wind | price | date | hour | fc_solar | Thermal_Gap | BoT_FR | Year | Day_of_Year | weekend | holiday |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 24400.0 | 7117.2 | 1732.0 | 58.82 | 2017-01-01 | 0.0 | 5.1 | 15545.7 | -400.0 | 2017 | 1 | 1 | 0 |
| 21318 | 22141.0 | 6022.1 | 3559.0 | 43.99 | 2019-06-08 | 7.0 | 498.9 | 12061.0 | 800.0 | 2019 | 159 | 1 | 0 |
| 21317 | 21621.0 | 6022.1 | 3335.0 | 43.57 | 2019-06-08 | 6.0 | 396.6 | 11867.3 | 800.0 | 2019 | 159 | 1 | 0 |
| 21316 | 21738.0 | 5797.1 | 3150.0 | 43.53 | 2019-06-08 | 5.0 | 527.8 | 12263.1 | 800.0 | 2019 | 159 | 1 | 0 |
| 21315 | 22027.0 | 5797.1 | 3114.0 | 43.58 | 2019-06-08 | 4.0 | 583.7 | 12532.2 | 800.0 | 2019 | 159 | 1 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 17371 | 25410.0 | 5976.1 | 2396.0 | 66.70 | 2018-12-25 | 19.0 | 16.0 | 17021.9 | -950.0 | 2018 | 359 | 0 | 1 |
| 17372 | 26380.0 | 5976.1 | 2400.0 | 67.65 | 2018-12-25 | 20.0 | 9.0 | 17994.9 | -950.0 | 2018 | 359 | 0 | 1 |
| 17373 | 26830.0 | 5976.1 | 2556.0 | 68.70 | 2018-12-25 | 21.0 | 8.5 | 18289.4 | 200.0 | 2018 | 359 | 0 | 1 |
| 17365 | 24700.0 | 5776.1 | 1409.0 | 66.58 | 2018-12-25 | 13.0 | 2155.7 | 15359.2 | 1150.0 | 2018 | 359 | 0 | 1 |
| 16067 | 27551.0 | 6105.9 | 3194.0 | 64.46 | 2018-11-01 | 11.0 | 3828.1 | 14423.0 | 350.0 | 2018 | 305 | 0 | 1 |
32135 rows × 13 columns
#Adding a feature COVID_PERIOD with values 0 and 1, indicating whether the day falls within the Covid-19 lockdown period
#The first lockdown in Spain started on 14-March-2020
pm_df['covid_period'] = (pm_df['date'] >= pd.Timestamp(2020, 3, 14)).astype(int)
#Additionally, we also decided to create a variable to model the hard lockdown, since energy prices likely fluctuated more due to the temporary cessation of many business activities.
pm_df['hard_lockdown'] = pm_df['date'].between(pd.Timestamp(2020, 3, 14), pd.Timestamp(2020, 6, 21)).astype(int)
#Computing price lags (hour-over-hour and day-over-day changes):
pm_df['price_lag_hourly'] = pm_df['price'].diff()
pm_df['price_lag_daily'] = pm_df['price'].diff(periods=24)
#Computing Thermal_Gap lags (we are unable to use price lags given the scoring dataset, so we decided to include Thermal_Gap
#lags to capture the effect of past information on price, since this variable is highly correlated with price):
pm_df['thermal_gap_lag_hourly'] = pm_df['Thermal_Gap'].diff()
pm_df['thermal_gap_daily'] = pm_df['Thermal_Gap'].diff(periods=24)
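Note that diff(n) yields the change relative to n periods earlier, not the past value itself; if the lagged level were wanted instead, shift(n) would be the tool. A quick illustration on a toy series:
s = pd.Series([10.0, 12.0, 15.0])
print(s.shift(1).tolist()) #[nan, 10.0, 12.0] <- lagged level
print(s.diff(1).tolist())  #[nan, 2.0, 3.0]   <- change vs. previous value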
pm_df = pm_df.drop(columns = ['date'], axis = 1)
pm_df.head(12)
| | fc_demand | fc_nuclear | fc_wind | price | hour | fc_solar | Thermal_Gap | BoT_FR | Year | Day_of_Year | weekend | holiday | covid_period | hard_lockdown | price_lag_hourly | price_lag_daily | thermal_gap_lag_hourly | thermal_gap_daily |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 24400.0 | 7117.2 | 1732.0 | 58.82 | 0.0 | 5.1 | 15545.7 | -400.0 | 2017 | 1 | 1 | 0 | 0 | 0 | NaN | NaN | NaN | NaN |
| 1 | 23616.0 | 7117.2 | 1826.0 | 58.23 | 1.0 | 0.6 | 14672.2 | -350.0 | 2017 | 1 | 1 | 0 | 0 | 0 | -0.59 | NaN | -873.5 | NaN |
| 2 | 21893.0 | 7117.2 | 1823.0 | 51.95 | 2.0 | 4.6 | 12948.2 | -350.0 | 2017 | 1 | 1 | 0 | 0 | 0 | -6.28 | NaN | -1724.0 | NaN |
| 3 | 20693.0 | 7117.2 | 1777.0 | 47.27 | 3.0 | 9.7 | 11789.1 | -350.0 | 2017 | 1 | 1 | 0 | 0 | 0 | -4.68 | NaN | -1159.1 | NaN |
| 4 | 19599.0 | 7117.2 | 1746.0 | 45.49 | 4.0 | 24.1 | 10711.7 | -350.0 | 2017 | 1 | 1 | 0 | 0 | 0 | -1.78 | NaN | -1077.4 | NaN |
| 5 | 19211.0 | 7117.2 | 1662.0 | 44.50 | 5.0 | 30.4 | 10401.4 | -350.0 | 2017 | 1 | 1 | 0 | 0 | 0 | -0.99 | NaN | -310.3 | NaN |
| 6 | 19314.0 | 7117.2 | 1684.0 | 44.50 | 6.0 | 40.0 | 10472.8 | -350.0 | 2017 | 1 | 1 | 0 | 0 | 0 | 0.00 | NaN | 71.4 | NaN |
| 7 | 19538.0 | 7117.2 | 1780.0 | 44.72 | 7.0 | 45.5 | 10595.3 | -350.0 | 2017 | 1 | 1 | 0 | 0 | 0 | 0.22 | NaN | 122.5 | NaN |
| 8 | 19651.0 | 7117.2 | 1803.0 | 44.22 | 8.0 | 99.7 | 10631.1 | -350.0 | 2017 | 1 | 1 | 0 | 0 | 0 | -0.50 | NaN | 35.8 | NaN |
| 9 | 20066.0 | 7117.2 | 1737.0 | 45.13 | 9.0 | 562.9 | 10648.9 | -350.0 | 2017 | 1 | 1 | 0 | 0 | 0 | 0.91 | NaN | 17.8 | NaN |
| 10 | 22001.0 | 7117.2 | 1562.0 | 46.23 | 10.0 | 1377.3 | 11944.5 | -350.0 | 2017 | 1 | 1 | 0 | 0 | 0 | 1.10 | NaN | 1295.6 | NaN |
| 11 | 23417.0 | 7117.2 | 1433.0 | 47.91 | 11.0 | 2056.2 | 12810.6 | -350.0 | 2017 | 1 | 1 | 0 | 0 | 0 | 1.68 | NaN | 866.1 | NaN |
pm_df = pm_df.iloc[24:] #dropping the first 24 rows, where the hourly and daily diff features (price_lag_*, thermal_gap_*) are null
From the Pandas profiling, we can see that, after feature engineering, we have 19 variables, 14 of them numeric and 5 categorical. The total number of observations is now 28048, with 0 values missing. The interaction and Pearson's correlation plots from the profiling report are again omitted here.
#Creating a dataset with all variables standardized ((X-mu)/sigma) in order to feed it to specific algorithms that require such
#a pre-processing step (e.g. SVR).
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(pm_df)
standard_pm_df = scaler.transform(pm_df)
standard_pm_df = pd.DataFrame(data = standard_pm_df, index = None , columns = pm_df.columns)
standard_pm_df.head(12)
| | fc_demand | fc_nuclear | fc_wind | price | hour | fc_solar | Thermal_Gap | BoT_FR | Year | Day_of_Year | weekend | holiday | covid_period | hard_lockdown | price_lag_hourly | price_lag_daily | thermal_gap_lag_hourly | thermal_gap_daily |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.290698 | 0.819699 | -0.513498 | 0.425968 | -1.661390 | -0.832513 | 0.256123 | -0.237145 | -1.279716 | -1.651612 | -0.632421 | -0.168708 | -0.382909 | -0.284266 | -1.353382 | -0.040246 | -2.071042 | 1.016536 |
| 1 | -0.786961 | 0.819699 | -0.524298 | 0.292467 | -1.516925 | -0.832314 | -0.200244 | -0.168852 | -1.279716 | -1.651612 | -0.632421 | -0.168708 | -0.382909 | -0.284266 | -0.640306 | 0.349460 | -1.620371 | 1.028132 |
| 2 | -1.106277 | 0.819699 | -0.555464 | 0.052301 | -1.372460 | -0.832314 | -0.478200 | -0.168852 | -1.279716 | -1.651612 | -0.632421 | -0.168708 | -0.382909 | -0.284266 | -1.152004 | 0.293788 | -0.986886 | 0.954114 |
| 3 | -1.227301 | 0.819699 | -0.524298 | -0.240163 | -1.227995 | -0.832613 | -0.611181 | -0.168852 | -1.279716 | -1.651612 | -0.632421 | -0.168708 | -0.382909 | -0.284266 | -1.402902 | -0.104065 | -0.472117 | 0.866908 |
| 4 | -1.267429 | 0.819699 | -0.598664 | -0.227777 | -1.083530 | -0.832015 | -0.600860 | -0.168852 | -1.279716 | -1.651612 | -0.632421 | -0.168708 | -0.382909 | -0.284266 | 0.059565 | 0.027647 | 0.036709 | 0.845433 |
| 5 | -1.134025 | 0.819699 | -0.641863 | 0.052301 | -0.939065 | -0.832164 | -0.448372 | -0.168852 | -1.279716 | -1.651612 | -0.632421 | -0.168708 | -0.382909 | -0.284266 | 1.343762 | 0.320945 | 0.541502 | 0.842919 |
| 6 | -0.651850 | 0.819699 | -0.603601 | 0.630350 | -0.794600 | -0.832314 | -0.022909 | -0.783483 | -1.279716 | -1.651612 | -0.632421 | -0.168708 | -0.382909 | -0.284266 | 2.773216 | 1.008023 | 1.510762 | 0.783909 |
| 7 | 0.142384 | 0.819699 | -0.596195 | 1.515314 | -0.650135 | -0.832164 | 0.713666 | -0.237145 | -1.279716 | -1.651612 | -0.632421 | -0.168708 | -0.382909 | -0.284266 | 4.245586 | 1.555241 | 2.615435 | 0.903930 |
| 8 | 0.837152 | 0.819699 | -0.673646 | 1.591011 | -0.505670 | -0.808841 | 1.402919 | -0.237145 | -1.279716 | -1.651612 | -0.632421 | -0.168708 | -0.382909 | -0.284266 | 0.363282 | 0.902110 | 2.447406 | 1.199773 |
| 9 | 1.348996 | 0.819699 | -0.735668 | 1.665331 | -0.361205 | -0.626546 | 1.847889 | -0.237145 | -1.279716 | -1.651612 | -0.632421 | -0.168708 | -0.382909 | -0.284266 | 0.356680 | 1.043327 | 1.580025 | 1.261811 |
| 10 | 1.652517 | 0.819699 | -0.743691 | 1.709373 | -0.216740 | -0.351359 | 2.026379 | -0.237145 | -1.279716 | -1.651612 | -0.632421 | -0.168708 | -0.382909 | -0.284266 | 0.211424 | 0.659053 | 0.633830 | 1.082652 |
| 11 | 1.628184 | 0.819699 | -0.880079 | 1.707996 | -0.072275 | -0.101238 | 1.991729 | -0.237145 | -1.279716 | -1.651612 | -0.632421 | -0.168708 | -0.382909 | -0.284266 | -0.006461 | 0.995802 | -0.122971 | 1.022462 |
len(standard_pm_df)
32087
standard_pm_df.head(5)
| | fc_demand | fc_nuclear | fc_wind | price | hour | fc_solar | Thermal_Gap | BoT_FR | Year | Day_of_Year | weekend | holiday | covid_period | hard_lockdown | price_lag_hourly | price_lag_daily | thermal_gap_lag_hourly | thermal_gap_daily |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.290698 | 0.819699 | -0.513498 | 0.425968 | -1.661390 | -0.832513 | 0.256123 | -0.237145 | -1.279716 | -1.651612 | -0.632421 | -0.168708 | -0.382909 | -0.284266 | -1.353382 | -0.040246 | -2.071042 | 1.016536 |
| 1 | -0.786961 | 0.819699 | -0.524298 | 0.292467 | -1.516925 | -0.832314 | -0.200244 | -0.168852 | -1.279716 | -1.651612 | -0.632421 | -0.168708 | -0.382909 | -0.284266 | -0.640306 | 0.349460 | -1.620371 | 1.028132 |
| 2 | -1.106277 | 0.819699 | -0.555464 | 0.052301 | -1.372460 | -0.832314 | -0.478200 | -0.168852 | -1.279716 | -1.651612 | -0.632421 | -0.168708 | -0.382909 | -0.284266 | -1.152004 | 0.293788 | -0.986886 | 0.954114 |
| 3 | -1.227301 | 0.819699 | -0.524298 | -0.240163 | -1.227995 | -0.832613 | -0.611181 | -0.168852 | -1.279716 | -1.651612 | -0.632421 | -0.168708 | -0.382909 | -0.284266 | -1.402902 | -0.104065 | -0.472117 | 0.866908 |
| 4 | -1.267429 | 0.819699 | -0.598664 | -0.227777 | -1.083530 | -0.832015 | -0.600860 | -0.168852 | -1.279716 | -1.651612 | -0.632421 | -0.168708 | -0.382909 | -0.284266 | 0.059565 | 0.027647 | 0.036709 | 0.845433 |
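One caveat: the scaler above was fitted on the full dataset (target included), so test-period statistics leak into the transform. A leakage-free sketch would fit the scaler on a training portion only and reuse it on the rest (illustrative, with a hypothetical 80/20 chronological split; not the pipeline used below):
split_point = int(len(pm_df) * 0.8) #hypothetical chronological split, for illustration only
train_scaler = StandardScaler().fit(pm_df.iloc[:split_point])
standard_train = pd.DataFrame(train_scaler.transform(pm_df.iloc[:split_point]), columns=pm_df.columns)
standard_test = pd.DataFrame(train_scaler.transform(pm_df.iloc[split_point:]), columns=pm_df.columns)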
Based on the previous analysis of the variables' correlation matrix, we decided to consider ['Thermal_Gap', 'fc_wind', 'fc_solar', 'BoT_FR', 'holiday', 'hour', 'weekend', 'thermal_gap_daily', 'covid_period', 'hard_lockdown'] as preliminary features for our model.
from sklearn.model_selection import train_test_split , TimeSeriesSplit
# price is the target variable
X = pm_df.loc[:,['Thermal_Gap', 'fc_wind' , 'fc_solar', 'BoT_FR' , 'holiday' , 'hour', 'weekend' , 'thermal_gap_daily', 'covid_period', 'hard_lockdown']]
y = pm_df['price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False) #shuffle=False keeps the chronological order, so the test set is the most recent 20% of the data
X_test
| | Thermal_Gap | fc_wind | fc_solar | BoT_FR | holiday | hour | weekend | thermal_gap_daily | covid_period | hard_lockdown |
|---|---|---|---|---|---|---|---|---|---|---|
| 25717 | 15475.2 | 3229.0 | 3020.9 | 0.0 | 0 | 13.0 | 1 | -2623.0 | 0 | 0 |
| 25718 | 15523.7 | 3570.0 | 2720.4 | 0.0 | 0 | 14.0 | 1 | -2245.7 | 0 | 0 |
| 25719 | 14023.8 | 4115.0 | 2312.3 | 0.0 | 0 | 15.0 | 1 | -2899.2 | 0 | 0 |
| 25720 | 14061.3 | 4394.0 | 1399.8 | 0.0 | 0 | 16.0 | 1 | -3263.9 | 0 | 0 |
| 25721 | 14875.6 | 4956.0 | 391.5 | 0.0 | 0 | 17.0 | 1 | -4629.3 | 0 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 32130 | 15218.4 | 2846.0 | 3650.4 | -450.0 | 0 | 19.0 | 0 | 9953.1 | 1 | 0 |
| 32131 | 18131.7 | 2861.0 | 1343.1 | -450.0 | 0 | 20.0 | 0 | 10280.9 | 1 | 0 |
| 32132 | 20026.3 | 2859.0 | 682.5 | -450.0 | 0 | 21.0 | 0 | 9197.1 | 1 | 0 |
| 32133 | 18212.5 | 2771.0 | 598.3 | -450.0 | 0 | 22.0 | 0 | 7631.9 | 1 | 0 |
| 32134 | 16191.0 | 2746.0 | 617.8 | -450.0 | 0 | 23.0 | 0 | 6751.7 | 1 | 0 |
6418 rows × 10 columns
# Inferring the number of splits --> An attempt at adjusting our model's predictions on a daily basis. This "learning as you go" approach can sometimes lead to some data leakage.
required_splits = (len(pm_df.index)/24)
print(required_splits)
1336.9583333333333
tss = TimeSeriesSplit(n_splits = 1338)
print(tss)
TimeSeriesSplit(max_train_size=None, n_splits=1338)
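To see the expanding-window behaviour of TimeSeriesSplit, one can peek at the first few folds (a small sketch):
for i, (tr_idx, te_idx) in enumerate(tss.split(X)):
    print("Fold {}: train rows 0-{}, test rows {}-{}".format(i, tr_idx[-1], te_idx[0], te_idx[-1]))
    if i == 2: #only show the first three folds
        break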
for train_index, test_index in tss.split(X):
    Tss_X_train, Tss_X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    Tss_y_train, Tss_y_test = y.iloc[train_index], y.iloc[test_index]
Tss_X_test
| | Thermal_Gap | fc_wind | fc_solar | BoT_FR | holiday | hour | weekend | thermal_gap_daily | covid_period | hard_lockdown |
|---|---|---|---|---|---|---|---|---|---|---|
| 32112 | 6915.100 | 6601.0 | 545.700 | -300.0 | 0 | 1.0 | 0 | 1082.3 | 1 | 0 |
| 32113 | 6539.000 | 6259.0 | 455.800 | -300.0 | 0 | 2.0 | 0 | 1057.9 | 1 | 0 |
| 32114 | 6410.400 | 5886.0 | 468.400 | -300.0 | 0 | 3.0 | 0 | 996.9 | 1 | 0 |
| 32115 | 7190.200 | 5382.0 | 405.600 | -300.0 | 0 | 4.0 | 0 | 1748.5 | 1 | 0 |
| 32116 | 8028.000 | 5193.0 | 313.800 | -300.0 | 0 | 5.0 | 0 | 2162.0 | 1 | 0 |
| 32117 | 10326.600 | 4996.0 | 213.200 | -300.0 | 0 | 6.0 | 0 | 4006.2 | 1 | 0 |
| 32118 | 12575.400 | 4720.0 | 137.400 | -300.0 | 0 | 7.0 | 0 | 5880.8 | 1 | 0 |
| 32119 | 14118.900 | 4225.0 | 1163.900 | -450.0 | 0 | 8.0 | 0 | 7820.6 | 1 | 0 |
| 32120 | 13816.300 | 3460.0 | 3936.500 | -450.0 | 0 | 9.0 | 0 | 8299.8 | 1 | 0 |
| 32121 | 12011.500 | 3471.0 | 6826.300 | -450.0 | 0 | 10.0 | 0 | 8136.7 | 1 | 0 |
| 32122 | 12664.125 | 3258.0 | 7207.675 | -450.0 | 0 | 11.0 | 0 | 9046.1 | 1 | 0 |
| 32123 | 13616.225 | 2905.0 | 7311.575 | -450.0 | 0 | 12.0 | 0 | 9315.5 | 1 | 0 |
| 32124 | 14378.425 | 2696.0 | 7348.375 | -450.0 | 0 | 13.0 | 0 | 9387.7 | 1 | 0 |
| 32125 | 14202.525 | 2596.0 | 7292.275 | -450.0 | 0 | 14.0 | 0 | 9095.3 | 1 | 0 |
| 32126 | 13228.925 | 2548.0 | 7147.875 | -450.0 | 0 | 15.0 | 0 | 8719.2 | 1 | 0 |
| 32127 | 12760.625 | 2508.0 | 7269.175 | -450.0 | 0 | 16.0 | 0 | 9057.6 | 1 | 0 |
| 32128 | 12551.625 | 2606.0 | 7298.175 | -450.0 | 0 | 17.0 | 0 | 9643.0 | 1 | 0 |
| 32129 | 13407.500 | 2716.0 | 6094.300 | -450.0 | 0 | 18.0 | 0 | 9889.5 | 1 | 0 |
| 32130 | 15218.400 | 2846.0 | 3650.400 | -450.0 | 0 | 19.0 | 0 | 9953.1 | 1 | 0 |
| 32131 | 18131.700 | 2861.0 | 1343.100 | -450.0 | 0 | 20.0 | 0 | 10280.9 | 1 | 0 |
| 32132 | 20026.300 | 2859.0 | 682.500 | -450.0 | 0 | 21.0 | 0 | 9197.1 | 1 | 0 |
| 32133 | 18212.500 | 2771.0 | 598.300 | -450.0 | 0 | 22.0 | 0 | 7631.9 | 1 | 0 |
| 32134 | 16191.000 | 2746.0 | 617.800 | -450.0 | 0 | 23.0 | 0 | 6751.7 | 1 | 0 |
#This method helps us make predictions on a rolling basis. The train/test split percentage and the selected number of splits determine
#the size of our rolling window (e.g. predicting the next 24 hours with the previous 24 hours of data).
class BlockingTimeSeriesSplit():
    def __init__(self, n_splits):
        self.n_splits = n_splits

    def get_n_splits(self, X, y, groups):
        return self.n_splits

    def split(self, X, y=None, groups=None):
        n_samples = len(X)
        k_fold_size = n_samples // self.n_splits
        indices = np.arange(n_samples)
        margin = 0
        for i in range(self.n_splits):
            start = i * k_fold_size
            stop = start + k_fold_size
            mid = int(0.6667 * (stop - start)) + start #splitting 66.7% train / 33.3% test; this can be changed according to the desired rationale / number of splits
            yield indices[start:mid], indices[mid + margin:stop]
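A quick sanity check of the splitter on a toy range (a sketch) makes the disjoint 66.7/33.3 blocks visible:
demo_X = np.arange(12).reshape(-1, 1) #12 samples -> 2 blocks of 6
for tr_idx, te_idx in BlockingTimeSeriesSplit(n_splits=2).split(demo_X):
    print("train:", tr_idx, "test:", te_idx)
#train: [0 1 2 3] test: [4 5]
#train: [6 7 8 9] test: [10 11]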
#Several options were tried based on the price mechanics of the energy market:
#btss_required_splits = (len(pm_df.index)/(24)) #we want to predict the next 5 hours based on the 19 hours that precede them (80/20 split)
#btss_required_splits = (len(pm_df.index)/(24 * 2)) #we want to predict the next 24 hours based on the 24 hours that precede them (50/50 split)
btss_required_splits = (len(pm_df.index)/(24 * 3)) #we want to predict the next 24 hours based on the 48 hours that precede them (66.7/33.3 split)
#btss_required_splits = (len(pm_df.index)/(24 * 4)) #we want to predict the next 24 hours based on the 3 days of data that precede them (75/25 split)
#btss_required_splits = (len(pm_df.index)/(24 * 5)) #we want to predict the next 24 hours based on the 4 days of data that precede them (80/20 split)
#btss_required_splits = (len(pm_df.index)/(24 * 8)) #we want to predict the next 24 hours based on the week of data that precedes them (87.5/12.5 split)
print(btss_required_splits)
445.65277777777777
btss = BlockingTimeSeriesSplit(n_splits = 446)
for train_index, test_index in btss.split(X):
    Btss_X_train, Btss_X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    Btss_y_train, Btss_y_test = y.iloc[train_index], y.iloc[test_index]
Btss_X_test
| | Thermal_Gap | fc_wind | fc_solar | BoT_FR | holiday | hour | weekend | thermal_gap_daily | covid_period | hard_lockdown |
|---|---|---|---|---|---|---|---|---|---|---|
| 31690 | 16850.125 | 424.0 | 7152.675 | -200.0 | 0 | 11.0 | 0 | 2493.300 | 1 | 0 |
| 31691 | 17801.925 | 499.0 | 7317.875 | -200.0 | 0 | 12.0 | 0 | 3123.900 | 1 | 0 |
| 31692 | 18553.425 | 548.0 | 7360.375 | -200.0 | 0 | 13.0 | 0 | 3484.300 | 1 | 0 |
| 31693 | 17855.425 | 724.0 | 7311.375 | -200.0 | 0 | 14.0 | 0 | 3476.300 | 1 | 0 |
| 31694 | 16393.925 | 972.0 | 7365.875 | -200.0 | 0 | 15.0 | 0 | 3854.900 | 1 | 0 |
| 31695 | 16012.425 | 1200.0 | 7352.375 | -200.0 | 0 | 16.0 | 0 | 4857.300 | 1 | 0 |
| 31696 | 15542.725 | 1556.0 | 7321.075 | -200.0 | 0 | 17.0 | 0 | 5215.125 | 1 | 0 |
| 31697 | 14880.100 | 2023.0 | 6846.700 | -200.0 | 0 | 18.0 | 0 | 3499.700 | 1 | 0 |
| 31698 | 16074.200 | 2251.0 | 4885.600 | -200.0 | 0 | 19.0 | 0 | 2886.200 | 1 | 0 |
| 31699 | 18455.100 | 2286.0 | 2151.700 | -200.0 | 0 | 20.0 | 0 | 2585.100 | 1 | 0 |
| 31700 | 20533.900 | 2238.0 | 846.900 | -200.0 | 0 | 21.0 | 0 | 2111.100 | 1 | 0 |
| 31701 | 19434.000 | 2297.0 | 624.800 | -200.0 | 0 | 22.0 | 0 | 1503.200 | 1 | 0 |
| 31702 | 17145.300 | 2493.0 | 610.500 | -200.0 | 0 | 23.0 | 0 | 459.600 | 1 | 0 |
| 31703 | 15143.700 | 2797.0 | 613.100 | -200.0 | 0 | 0.0 | 0 | -334.600 | 1 | 0 |
| 31704 | 13652.700 | 2789.0 | 587.100 | -200.0 | 0 | 1.0 | 0 | -918.700 | 1 | 0 |
| 31705 | 12730.100 | 2586.0 | 550.700 | -200.0 | 0 | 2.0 | 0 | -1059.700 | 1 | 0 |
| 31706 | 12013.900 | 2655.0 | 532.900 | -200.0 | 0 | 3.0 | 0 | -1566.900 | 1 | 0 |
| 31707 | 12010.400 | 2377.0 | 506.400 | -200.0 | 0 | 4.0 | 0 | -1291.400 | 1 | 0 |
| 31708 | 12576.600 | 2030.0 | 418.200 | -200.0 | 0 | 5.0 | 0 | -976.000 | 1 | 0 |
| 31709 | 14242.800 | 1855.0 | 271.000 | -200.0 | 0 | 6.0 | 0 | -575.700 | 1 | 0 |
| 31710 | 15615.500 | 1632.0 | 317.300 | -200.0 | 0 | 7.0 | 0 | -488.200 | 1 | 0 |
| 31711 | 16504.300 | 1297.0 | 1478.500 | -200.0 | 0 | 8.0 | 0 | -474.600 | 1 | 0 |
| 31712 | 16208.800 | 867.0 | 4332.000 | -200.0 | 0 | 9.0 | 0 | -709.100 | 1 | 0 |
| 31713 | 15236.900 | 735.0 | 6917.900 | -200.0 | 0 | 10.0 | 0 | -768.400 | 1 | 0 |
Btss_X_train.head(5)
| | Thermal_Gap | fc_wind | fc_solar | BoT_FR | holiday | hour | weekend | thermal_gap_daily | covid_period | hard_lockdown |
|---|---|---|---|---|---|---|---|---|---|---|
| 31643 | 15352.4 | 5633.0 | 5149.4 | -200.0 | 0 | 12.0 | 0 | -481.1 | 1 | 0 |
| 31644 | 15538.0 | 6189.0 | 5461.8 | -200.0 | 0 | 13.0 | 0 | -128.5 | 1 | 0 |
| 31645 | 13880.4 | 7070.0 | 5903.4 | -200.0 | 0 | 14.0 | 0 | -1962.3 | 1 | 0 |
| 31646 | 11811.3 | 8375.0 | 5536.5 | -200.0 | 0 | 15.0 | 0 | -2350.3 | 1 | 0 |
| 31647 | 13065.6 | 7480.0 | 4747.2 | -200.0 | 0 | 16.0 | 0 | -951.4 | 1 | 0 |
#Function to visualize feature importance:
def get_feature_importance(model, feature_names):
    return pd.DataFrame({'variable': feature_names, # Feature names
                         'coefficient': model.coef_[0:] # Feature coefficients
                         }) \
             .round(decimals=2) \
             .sort_values('coefficient', ascending=False) \
             .style.bar(color=['red', 'green'], align='zero')
#Function to display cross-validation scores
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard Deviation:", scores.std())
from sklearn import datasets, linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Create linear regression object
regressor = LinearRegression()
# Train the model using the training sets
regressor.fit(X_train, y_train)
# Make predictions using the testing set
y_pred = regressor.predict(X_test)
# The coefficients
print('Coefficients: \n', regressor.coef_)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % regressor.score(X_train, y_train))
print('Test R-Squared: %.2f' % r2_score(y_test, y_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(y_test, y_pred, squared=False))
Coefficients: [ 1.82323732e-03 -3.14519201e-07 -5.97177059e-04 -7.72134555e-04  1.08557367e+00 -5.23101340e-02  2.10839715e+00 -9.82361376e-05  0.00000000e+00  0.00000000e+00]
Train R-Squared: 0.51
Test R-Squared: -2.10
Test RMSE: 17.77
# Defining Table to Summarize Results:
results_df = pd.DataFrame(columns=['Algorithm', 'RMSE'])
results_df.loc[len(results_df)] = ['LR Baseline', mean_squared_error(y_test,regressor.predict(X_test) , squared=False)]
results_df
| | Algorithm | RMSE |
|---|---|---|
| 0 | LR Baseline | 17.767744 |
get_feature_importance(regressor, X_train.columns) #these coefficients cannot be compared with each other, since the data is not standardized.
| | variable | coefficient |
|---|---|---|
| 6 | weekend | 2.110000 |
| 4 | holiday | 1.090000 |
| 0 | Thermal_Gap | 0.000000 |
| 1 | fc_wind | -0.000000 |
| 2 | fc_solar | -0.000000 |
| 3 | BoT_FR | -0.000000 |
| 7 | thermal_gap_daily | -0.000000 |
| 8 | covid_period | 0.000000 |
| 9 | hard_lockdown | 0.000000 |
| 5 | hour | -0.050000 |
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
df.head(5)
| | Actual | Predicted |
|---|---|---|
| 25717 | 42.01 | 53.837649 |
| 25718 | 42.00 | 54.016046 |
| 25719 | 38.11 | 51.536796 |
| 25720 | 38.05 | 52.133520 |
| 25721 | 41.10 | 54.301961 |
# Create linear regression object
regressor = LinearRegression()
# Train the model using the training sets
regressor.fit(Tss_X_train, Tss_y_train)
# Make predictions using the testing set
Tss_y_pred = regressor.predict(Tss_X_test)
# The coefficients
print('Coefficients: \n', regressor.coef_)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % regressor.score(Tss_X_train, Tss_y_train))
print('Test R-Squared: %.2f' % r2_score(Tss_y_test, Tss_y_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Tss_y_test, Tss_y_pred, squared=False))
Coefficients: [ 1.88741820e-03  4.94290759e-05 -2.70561258e-04  1.93756139e-04  1.06551112e+00 -5.70424824e-02  1.76398908e+00 -1.33396902e-04 -1.23647451e+01 -6.92675729e+00]
Train R-Squared: 0.64
Test R-Squared: -1.14
Test RMSE: 10.47
results_df.loc[len(results_df)] = ['LR Baseline (TSS)', mean_squared_error(Tss_y_test,regressor.predict(Tss_X_test) , squared=False)]
results_df
| | Algorithm | RMSE |
|---|---|---|
| 0 | LR Baseline | 17.767744 |
| 1 | LR Baseline (TSS) | 10.468930 |
get_feature_importance(regressor, Tss_X_train.columns)
| | variable | coefficient |
|---|---|---|
| 6 | weekend | 1.760000 |
| 4 | holiday | 1.070000 |
| 0 | Thermal_Gap | 0.000000 |
| 1 | fc_wind | 0.000000 |
| 2 | fc_solar | -0.000000 |
| 3 | BoT_FR | 0.000000 |
| 7 | thermal_gap_daily | -0.000000 |
| 5 | hour | -0.060000 |
| 9 | hard_lockdown | -6.930000 |
| 8 | covid_period | -12.360000 |
Tss_df = pd.DataFrame({'Actual': Tss_y_test, 'Predicted': Tss_y_pred})
Tss_df.head(5)
| | Actual | Predicted |
|---|---|---|
| 32112 | 30.82 | 23.696823 |
| 32113 | 28.89 | 22.940596 |
| 32114 | 28.43 | 22.627123 |
| 32115 | 29.41 | 23.933707 |
| 32116 | 34.39 | 25.418279 |
# Train the model using the training sets
regressor.fit(Btss_X_train, Btss_y_train)
# Make predictions using the testing set
Btss_y_pred = regressor.predict(Btss_X_test)
# The coefficients
print('Coefficients: \n', regressor.coef_)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % regressor.score(Btss_X_train, Btss_y_train))
print('Test R-Squared: %.2f' % r2_score(Btss_y_test, Btss_y_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Btss_y_test, Btss_y_pred, squared=False))
Coefficients: [ 1.48704070e-03 -9.84573186e-05  2.99008053e-04  1.30104261e-17  0.00000000e+00  1.86891580e-02  0.00000000e+00 -1.06369437e-03  0.00000000e+00  0.00000000e+00]
Train R-Squared: 0.83
Test R-Squared: 0.08
Test RMSE: 4.40
results_df.loc[len(results_df)] = ['LR Baseline (BTSS)', mean_squared_error(Btss_y_test,regressor.predict(Btss_X_test) , squared=False)]
results_df
| | Algorithm | RMSE |
|---|---|---|
| 0 | LR Baseline | 17.767744 |
| 1 | LR Baseline (TSS) | 10.468930 |
| 2 | LR Baseline (BTSS) | 4.398897 |
get_feature_importance(regressor, Btss_X_train.columns)
| | variable | coefficient |
|---|---|---|
| 5 | hour | 0.020000 |
| 0 | Thermal_Gap | 0.000000 |
| 1 | fc_wind | -0.000000 |
| 2 | fc_solar | 0.000000 |
| 3 | BoT_FR | 0.000000 |
| 4 | holiday | 0.000000 |
| 6 | weekend | 0.000000 |
| 7 | thermal_gap_daily | -0.000000 |
| 8 | covid_period | 0.000000 |
| 9 | hard_lockdown | 0.000000 |
Btss_df = pd.DataFrame({'Actual': Btss_y_test, 'Predicted': Btss_y_pred})
Btss_df.head(5)
| | Actual | Predicted |
|---|---|---|
| 31690 | 40.29 | 42.613717 |
| 31691 | 41.93 | 43.419018 |
| 31692 | 41.74 | 44.179746 |
| 31693 | 39.99 | 43.137010 |
| 31694 | 38.09 | 40.571554 |
Comments: It appears that both the TSS and BTSS time-series splits perform better than a non-random train/test split. Given our discoveries regarding the mechanisms behind electricity price forecasts, this is not surprising and validates our initial findings.
import statsmodels.formula.api as smf
# Fit regression model
results = smf.ols('price ~ Thermal_Gap + fc_wind + fc_solar + BoT_FR + holiday + hour + weekend + thermal_gap_daily + covid_period + hard_lockdown' , data = pm_df).fit()
# Inspect the results
print(results.summary())
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.642
Model: OLS Adj. R-squared: 0.642
Method: Least Squares F-statistic: 5762.
Date: Wed, 31 Mar 2021 Prob (F-statistic): 0.00
Time: 03:45:43 Log-Likelihood: -1.1491e+05
No. Observations: 32087 AIC: 2.298e+05
Df Residuals: 32076 BIC: 2.299e+05
Df Model: 10
Covariance Type: nonrobust
=====================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------
Intercept 23.1431 0.344 67.250 0.000 22.469 23.818
Thermal_Gap 0.0019 1.67e-05 112.576 0.000 0.002 0.002
fc_wind 4.718e-05 2.24e-05 2.104 0.035 3.23e-06 9.11e-05
fc_solar -0.0003 2.7e-05 -10.009 0.000 -0.000 -0.000
BoT_FR 0.0002 7.46e-05 2.530 0.011 4.25e-05 0.000
holiday 1.0601 0.307 3.455 0.001 0.459 1.661
hour -0.0562 0.009 -6.427 0.000 -0.073 -0.039
weekend 1.7561 0.132 13.354 0.000 1.498 2.014
thermal_gap_daily -0.0001 1.46e-05 -8.843 0.000 -0.000 -0.000
covid_period -12.2366 0.224 -54.513 0.000 -12.677 -11.797
hard_lockdown -7.0708 0.281 -25.193 0.000 -7.621 -6.521
==============================================================================
Omnibus: 390.022 Durbin-Watson: 0.062
Prob(Omnibus): 0.000 Jarque-Bera (JB): 488.686
Skew: 0.194 Prob(JB): 7.64e-107
Kurtosis: 3.464 Cond. No. 1.28e+05
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
[2] The condition number is large, 1.28e+05. This might indicate that there are
strong multicollinearity or other numerical problems.
# Fit regression model (with standardized variables to avoid scaling issues, as mentioned earlier - see footnote 2).
#Please note that all our coefficients are significant at the 95% confidence level.
standard_results = smf.ols('price ~ Thermal_Gap + fc_wind + fc_solar + BoT_FR + holiday + hour + weekend + thermal_gap_daily + covid_period + hard_lockdown' , data = standard_pm_df).fit()
# Inspect the results
print(standard_results.summary())
OLS Regression Results
==============================================================================
Dep. Variable: price R-squared: 0.642
Model: OLS Adj. R-squared: 0.642
Method: Least Squares F-statistic: 5762.
Date: Wed, 31 Mar 2021 Prob (F-statistic): 0.00
Time: 03:45:43 Log-Likelihood: -29032.
No. Observations: 32087 AIC: 5.809e+04
Df Residuals: 32076 BIC: 5.818e+04
Df Model: 10
Covariance Type: nonrobust
=====================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------
Intercept -1.665e-16 0.003 -4.99e-14 1.000 -0.007 0.007
Thermal_Gap 0.6508 0.006 112.576 0.000 0.639 0.662
fc_wind 0.0105 0.005 2.104 0.035 0.001 0.020
fc_solar -0.0373 0.004 -10.009 0.000 -0.045 -0.030
BoT_FR 0.0095 0.004 2.530 0.011 0.002 0.017
holiday 0.0120 0.003 3.455 0.001 0.005 0.019
hour -0.0268 0.004 -6.427 0.000 -0.035 -0.019
weekend 0.0546 0.004 13.354 0.000 0.047 0.063
thermal_gap_daily -0.0347 0.004 -8.843 0.000 -0.042 -0.027
covid_period -0.2812 0.005 -54.513 0.000 -0.291 -0.271
hard_lockdown -0.1280 0.005 -25.193 0.000 -0.138 -0.118
==============================================================================
Omnibus: 390.022 Durbin-Watson: 0.062
Prob(Omnibus): 0.000 Jarque-Bera (JB): 488.686
Skew: 0.194 Prob(JB): 7.64e-107
Kurtosis: 3.464 Cond. No. 3.52
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
# Replicate feature selection for the standardized dataframe. Just like earlier, price is the target variable.
X_std = standard_pm_df.loc[:,['Thermal_Gap', 'fc_wind' , 'fc_solar', 'BoT_FR' , 'holiday' , 'hour', 'weekend' , 'thermal_gap_daily','covid_period', 'hard_lockdown']]
y_std = standard_pm_df['price']
for train_index, test_index in tss.split(X_std):
    Tss_std_X_train, Tss_std_X_test = X_std.iloc[train_index, :], X_std.iloc[test_index, :]
    Tss_std_y_train, Tss_std_y_test = y_std.iloc[train_index], y_std.iloc[test_index]
from sklearn.linear_model import Ridge, Lasso, ElasticNet
ridge_reg = Ridge(alpha = 1, solver = "cholesky")
ridge_reg.fit(Tss_std_X_train, Tss_std_y_train)
ridge_reg_price_pred = ridge_reg.predict(Tss_std_X_test)
# The coefficients
print('Coefficients: \n', ridge_reg.coef_)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % ridge_reg.score(Tss_std_X_train, Tss_std_y_train))
print('Test R-Squared: %.2f' % r2_score(Tss_std_y_test,ridge_reg_price_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Tss_std_y_test,ridge_reg_price_pred, squared=False))
Coefficients: [ 0.65179432  0.01098931 -0.03736968  0.00975112  0.01202033 -0.02715008  0.05481733 -0.03577501 -0.28414264 -0.12538466]
Train R-Squared: 0.64
Test R-Squared: -1.14
Test RMSE: 0.72
#results_df.loc[len(results_df)] = ['Ridge LR Standardized (TSS)', mean_squared_error(Tss_std_y_test,ridge_reg.predict(Tss_std_X_test) , squared=False)]
#results_df
get_feature_importance(ridge_reg, Tss_std_X_train.columns) #coefficients can now be compared. We quickly observe that Thermal_Gap, weekend and the Covid-related variables appear to be the most impactful.
| | variable | coefficient |
|---|---|---|
| 0 | Thermal_Gap | 0.650000 |
| 6 | weekend | 0.050000 |
| 1 | fc_wind | 0.010000 |
| 3 | BoT_FR | 0.010000 |
| 4 | holiday | 0.010000 |
| 5 | hour | -0.030000 |
| 2 | fc_solar | -0.040000 |
| 7 | thermal_gap_daily | -0.040000 |
| 9 | hard_lockdown | -0.130000 |
| 8 | covid_period | -0.280000 |
Ridge_df = pd.DataFrame({'Actual': Tss_std_y_test, 'Predicted':ridge_reg_price_pred})
Ridge_df.head(5)
| | Actual | Predicted |
|---|---|---|
| 32064 | -1.215964 | -1.706056 |
| 32065 | -1.348777 | -1.758084 |
| 32066 | -1.380432 | -1.779648 |
| 32067 | -1.312993 | -1.689734 |
| 32068 | -0.970293 | -1.587577 |
#De-standardizing the predicted output
mean = y.iloc[test_index].mean()
std = y.iloc[test_index].std()
Ridge_df_destandardized = Ridge_df * std + mean
Ridge_df_destandardized.head(5)
| | Actual | Predicted |
|---|---|---|
| 32064 | 33.488895 | 29.904999 |
| 32065 | 32.517670 | 29.524531 |
| 32066 | 32.286187 | 29.366845 |
| 32067 | 32.779348 | 30.024355 |
| 32068 | 35.285409 | 30.771401 |
# RMSE(post de-standardising the variables):
print('Test RMSE: %.2f' % mean_squared_error(Tss_y_test, Ridge_df_destandardized.Predicted, squared=False))
Test RMSE: 9.29
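An alternative de-standardization would reuse the parameters stored in the fitted scaler for the price column, rather than the test fold's own mean/std (a sketch, assuming the scaler fitted above):
price_idx = list(pm_df.columns).index('price')
mu, sigma = scaler.mean_[price_idx], scaler.scale_[price_idx]
alt_predicted = Ridge_df['Predicted'] * sigma + mu #de-standardized with the scaler's own parameters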
results_df.loc[len(results_df)] = ['Ridge LR Standardized (TSS)', mean_squared_error(Tss_y_test, Ridge_df_destandardized.Predicted, squared=False)]
results_df
| | Algorithm | RMSE |
|---|---|---|
| 0 | LR Baseline | 17.767744 |
| 1 | LR Baseline (TSS) | 10.468930 |
| 2 | LR Baseline (BTSS) | 4.398897 |
| 3 | Ridge LR Standardized (TSS) | 9.294806 |
ridge_reg = Ridge(alpha = 1, solver = "cholesky")
ridge_reg.fit(Tss_X_train, Tss_y_train)
ridge_reg_price_pred = ridge_reg.predict(Tss_X_test)
# The coefficients
print('Coefficients: \n', ridge_reg.coef_)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % ridge_reg.score(Tss_X_train, Tss_y_train))
print('Test R-Squared: %.2f' % r2_score(Tss_y_test,ridge_reg_price_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Tss_y_test,ridge_reg_price_pred, squared=False))
Coefficients: [ 1.88754295e-03  4.95386709e-05 -2.70673823e-04  1.93909334e-04  1.06464989e+00 -5.70630033e-02  1.76402167e+00 -1.33429613e-04 -1.23605789e+01 -6.92684408e+00]
Train R-Squared: 0.64
Test R-Squared: -1.14
Test RMSE: 10.47
results_df.loc[len(results_df)] = ['Ridge LR (TSS)', mean_squared_error(Tss_y_test,ridge_reg.predict(Tss_X_test) , squared=False)]
results_df
| | Algorithm | RMSE |
|---|---|---|
| 0 | LR Baseline | 17.767744 |
| 1 | LR Baseline (TSS) | 10.468930 |
| 2 | LR Baseline (BTSS) | 4.398897 |
| 3 | Ridge LR Standardized (TSS) | 9.294806 |
| 4 | Ridge LR (TSS) | 10.466368 |
get_feature_importance(ridge_reg, Tss_X_train.columns)
| | variable | coefficient |
|---|---|---|
| 6 | weekend | 1.760000 |
| 4 | holiday | 1.060000 |
| 0 | Thermal_Gap | 0.000000 |
| 1 | fc_wind | 0.000000 |
| 2 | fc_solar | -0.000000 |
| 3 | BoT_FR | 0.000000 |
| 7 | thermal_gap_daily | -0.000000 |
| 5 | hour | -0.060000 |
| 9 | hard_lockdown | -6.930000 |
| 8 | covid_period | -12.360000 |
Ridge_df = pd.DataFrame({'Actual': Tss_y_test, 'Predicted':ridge_reg_price_pred})
Ridge_df.head(5)
| | Actual | Predicted |
|---|---|---|
| 32112 | 30.82 | 23.699906 |
| 32113 | 28.89 | 22.943585 |
| 32114 | 28.43 | 22.630035 |
| 32115 | 29.41 | 23.936623 |
| 32116 | 34.39 | 25.421256 |
for train_index, test_index in btss.split(X_std):
    Btss_std_X_train, Btss_std_X_test = X_std.iloc[train_index, :], X_std.iloc[test_index, :]
    Btss_std_y_train, Btss_std_y_test = y_std.iloc[train_index], y_std.iloc[test_index]
ridge_reg = Ridge(alpha = 1, solver = "cholesky")
ridge_reg.fit(Btss_std_X_train, Btss_std_y_train)
ridge_reg_price_pred = ridge_reg.predict(Btss_std_X_test)
# The coefficients
print('Coefficients: \n', ridge_reg.coef_)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % ridge_reg.score(Btss_std_X_train, Btss_std_y_train))
print('Test R-Squared: %.2f' % r2_score(Btss_std_y_test,ridge_reg_price_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Btss_std_y_test,ridge_reg_price_pred, squared=False))
Coefficients: [ 3.89635593e-01 -2.17323079e-02  4.34582026e-02 -1.69899899e-32 -1.35919919e-31  4.71382702e-02  5.43679677e-31 -1.61961437e-01 -2.17471871e-30 -1.35919919e-31]
Train R-Squared: 0.81
Test R-Squared: 0.31
Test RMSE: 0.26
#Plotting the BTSS ridge linear regression model
# Plot outputs
X_train1 = np.arange(0, len(Btss_std_y_train), 1)
graph_lm = sns.jointplot(x=X_train1, y=Btss_std_y_train, kind='reg', height=12)
regline = graph_lm.ax_joint.get_lines()[0]
regline.set_color('red')
regline.set_zorder(5)
#Plotting the residuals
f, ax = plt.subplots(figsize=(15, 10))
sns.residplot(x=X_train1, y=Btss_std_y_train, scatter_kws={"s": 80});
#results_df.loc[len(results_df)] = ['Ridge LR Standardized (BTSS)', mean_squared_error(Btss_std_y_test,ridge_reg.predict(Btss_std_X_test) , squared=False)]
#results_df
get_feature_importance(ridge_reg, Btss_std_X_train.columns)
| | variable | coefficient |
|---|---|---|
| 0 | Thermal_Gap | 0.390000 |
| 5 | hour | 0.050000 |
| 2 | fc_solar | 0.040000 |
| 3 | BoT_FR | -0.000000 |
| 4 | holiday | -0.000000 |
| 6 | weekend | 0.000000 |
| 8 | covid_period | -0.000000 |
| 9 | hard_lockdown | -0.000000 |
| 1 | fc_wind | -0.020000 |
| 7 | thermal_gap_daily | -0.160000 |
# Plotting the feature importance
importance = regressor.coef_
# summarize feature importance
for i, v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i, v))
# plot feature importance
plt.bar(range(len(importance)), importance)
plt.show()
Feature: 0, Score: 0.00149
Feature: 1, Score: -0.00010
Feature: 2, Score: 0.00030
Feature: 3, Score: 0.00000
Feature: 4, Score: 0.00000
Feature: 5, Score: 0.01869
Feature: 6, Score: 0.00000
Feature: 7, Score: -0.00106
Feature: 8, Score: 0.00000
Feature: 9, Score: 0.00000
Ridge_df = pd.DataFrame({'Actual': Btss_std_y_test, 'Predicted':ridge_reg_price_pred})
Ridge_df.head(5)
| | Actual | Predicted |
|---|---|---|
| 31642 | -0.564283 | -0.389016 |
| 31643 | -0.451426 | -0.331441 |
| 31644 | -0.464501 | -0.280672 |
| 31645 | -0.584928 | -0.329961 |
| 31646 | -0.715677 | -0.452831 |
#De-standardizing the predicted output
mean = y.iloc[test_index].mean()
std = y.iloc[test_index].std()
Ridge_df_destandardized = Ridge_df * std + mean
Ridge_df_destandardized.head(5)
| | Actual | Predicted |
|---|---|---|
| 31642 | 35.732810 | 36.555162 |
| 31643 | 36.262333 | 36.825301 |
| 31644 | 36.200986 | 37.063509 |
| 31645 | 35.635946 | 36.832246 |
| 31646 | 35.022474 | 36.255744 |
# RMSE(post de-standardising the variables):
print('Test RMSE: %.2f' % mean_squared_error(Btss_y_test, Ridge_df_destandardized.Predicted, squared=False))
Test RMSE: 4.48
results_df.loc[len(results_df)] = ['Ridge LR Standardized (BTSS)', mean_squared_error(Btss_y_test, Ridge_df_destandardized.Predicted , squared=False)]
results_df
| | Algorithm | RMSE |
|---|---|---|
| 0 | LR Baseline | 17.767744 |
| 1 | LR Baseline (TSS) | 10.468930 |
| 2 | LR Baseline (BTSS) | 4.398897 |
| 3 | Ridge LR Standardized (TSS) | 9.294806 |
| 4 | Ridge LR (TSS) | 10.466368 |
| 5 | Ridge LR Standardized (BTSS) | 4.484383 |
#Sample cross-validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(ridge_reg, Btss_X_train, Btss_y_train, cv=47, scoring='neg_root_mean_squared_error')
ridge_rmse_scores = (-scores)
display_scores(ridge_rmse_scores)
Scores: [1.29750184 1.07624028 2.72954671 0.09379598 0.37870647 0.37261353
 1.06685176 0.16073656 0.64014666 1.60662662 0.91212136 1.00720996
 1.16823678 1.81341611 1.21237318 1.91539224 2.8018882  3.11633261
 1.42337023 0.7648373  1.6550448  0.70721034 0.27294407 0.55149935
 0.6607112  1.16506773 0.32117961 2.00458727 0.87527721 1.36212473
 1.14587724 1.14835288 1.78674743 0.61474031 0.80660676 0.70278551
 2.08332425 3.15074703 1.88024639 0.62896396 0.87108668 1.39606891
 4.54775057 0.2409639  0.27246379 0.83686702 0.61314131]
Mean: 1.2310707368305016
Standard Deviation: 0.8946396200395008
from sklearn.model_selection import GridSearchCV
params_Ridge = {'alpha': [1,0.1,0.01,0.001,0.0001,0] , "fit_intercept": [True, False], "solver": ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']}
grid_search = GridSearchCV(ridge_reg, params_Ridge , cv = 5 , scoring = 'neg_root_mean_squared_error', return_train_score = True)
grid_search.fit(Btss_X_train, Btss_y_train)
grid_search.best_params_
{'alpha': 0.01, 'fit_intercept': True, 'solver': 'sag'}
grid_search.best_estimator_
Ridge(alpha=0.01, solver='sag')
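Note that cv=5 above is a plain (unshuffled) K-Fold, so some folds train on data that lies in the future of their test fold; a time-series-aware variant would pass a splitter object as cv instead (a sketch, not the run reported above):
ts_grid_search = GridSearchCV(Ridge(), params_Ridge, cv=TimeSeriesSplit(n_splits=5), scoring='neg_root_mean_squared_error', return_train_score=True)
ts_grid_search.fit(Btss_X_train, Btss_y_train)
print(ts_grid_search.best_params_)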
ridge_reg = Ridge(alpha=1, fit_intercept=False, solver='sparse_cg')
ridge_reg.fit(Btss_std_X_train, Btss_std_y_train)
ridge_reg_price_pred = ridge_reg.predict(Btss_std_X_test)
# The coefficients
print('Coefficients: \n', ridge_reg.coef_)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % ridge_reg.score(Btss_std_X_train, Btss_std_y_train))
print('Test R-Squared: %.2f' % r2_score(Btss_std_y_test,ridge_reg_price_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Btss_std_y_test,ridge_reg_price_pred, squared=False))
Coefficients: [ 0.39298259 -0.01703657  0.04295254 -0.00291961  0.013673    0.04552518  0.05125473 -0.16054282 -0.21165658  0.0230384 ]
Train R-Squared: 0.81
Test R-Squared: 0.31
Test RMSE: 0.26
Ridge_df = pd.DataFrame({'Actual': Btss_std_y_test, 'Predicted':ridge_reg_price_pred})
Ridge_df.head(5)
| | Actual | Predicted |
|---|---|---|
| 31642 | -0.564283 | -0.391116 |
| 31643 | -0.451426 | -0.332843 |
| 31644 | -0.464501 | -0.281615 |
| 31645 | -0.584928 | -0.331338 |
| 31646 | -0.715677 | -0.454932 |
#De-standardizing the predicted output
mean = y.iloc[test_index].mean()
std = y.iloc[test_index].std()
Ridge_df_destandardized = Ridge_df * std + mean
Ridge_df_destandardized.head(5)
| | Actual | Predicted |
|---|---|---|
| 31642 | 35.732810 | 36.545305 |
| 31643 | 36.262333 | 36.818721 |
| 31644 | 36.200986 | 37.059085 |
| 31645 | 35.635946 | 36.825786 |
| 31646 | 35.022474 | 36.245885 |
# RMSE(post de-standardising the variables):
print('Test RMSE: %.2f' % mean_squared_error(Btss_y_test, Ridge_df_destandardized.Predicted, squared=False))
Test RMSE: 4.49
ridge_reg = Ridge(alpha = 1, solver = "cholesky")
ridge_reg.fit(Btss_X_train, Btss_y_train)
ridge_reg_price_pred = ridge_reg.predict(Btss_X_test)
# The coefficients
print('Coefficients: \n', ridge_reg.coef_)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % ridge_reg.score(Btss_X_train, Btss_y_train))
print('Test R-Squared: %.2f' % r2_score(Btss_y_test,ridge_reg_price_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Btss_y_test,ridge_reg_price_pred, squared=False))
Coefficients: [ 1.48708781e-03 -9.84211042e-05  2.99014667e-04  0.00000000e+00  0.00000000e+00  1.86659579e-02  0.00000000e+00 -1.06372677e-03  0.00000000e+00  0.00000000e+00]
Train R-Squared: 0.83
Test R-Squared: 0.08
Test RMSE: 4.40
results_df.loc[len(results_df)] = ['Ridge LR (BTSS)', mean_squared_error(Btss_y_test,ridge_reg.predict(Btss_X_test) , squared=False)]
results_df
| | Algorithm | RMSE |
|---|---|---|
| 0 | LR Baseline | 17.767744 |
| 1 | LR Baseline (TSS) | 10.468930 |
| 2 | LR Baseline (BTSS) | 4.398897 |
| 3 | Ridge LR Standardized (TSS) | 9.294806 |
| 4 | Ridge LR (TSS) | 10.466368 |
| 5 | Ridge LR Standardized (BTSS) | 4.484383 |
| 6 | Ridge LR (BTSS) | 4.398964 |
get_feature_importance(ridge_reg, Btss_X_train.columns)
| | variable | coefficient |
|---|---|---|
| 5 | hour | 0.020000 |
| 0 | Thermal_Gap | 0.000000 |
| 1 | fc_wind | -0.000000 |
| 2 | fc_solar | 0.000000 |
| 3 | BoT_FR | 0.000000 |
| 4 | holiday | 0.000000 |
| 6 | weekend | 0.000000 |
| 7 | thermal_gap_daily | -0.000000 |
| 8 | covid_period | 0.000000 |
| 9 | hard_lockdown | 0.000000 |
Ridge_df = pd.DataFrame({'Actual': Btss_y_test, 'Predicted':ridge_reg_price_pred})
Ridge_df.head(5)
| | Actual | Predicted |
|---|---|---|
| 31690 | 40.29 | 42.613666 |
| 31691 | 41.93 | 43.418972 |
| 31692 | 41.74 | 44.179703 |
| 31693 | 39.99 | 43.136917 |
| 31694 | 38.09 | 40.571365 |
lasso_reg = Lasso(alpha = 0.1)
lasso_reg.fit(Tss_std_X_train, Tss_std_y_train)
lasso_price_pred = lasso_reg.predict(Tss_std_X_test)
# The coefficients
print('Coefficients: \n', lasso_reg.coef_)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % lasso_reg.score(Tss_std_X_train, Tss_std_y_train))
print('Test R-Squared: %.2f' % r2_score(Tss_std_y_test,lasso_price_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Tss_std_y_test, lasso_price_pred, squared=False))
Coefficients: [ 0.51470219 -0.         -0.         -0.         -0.          0.          0.         -0.         -0.25457663 -0.08400314]
Train R-Squared: 0.62
Test R-Squared: 0.11
Test RMSE: 0.46
#results_df.loc[len(results_df)] = ['Lasso LR Standardized (TSS)', mean_squared_error(Tss_std_y_test,lasso_reg.predict(Tss_std_X_test) , squared=False)]
#results_df
get_feature_importance(lasso_reg, Tss_std_X_train.columns)
| variable | coefficient | |
|---|---|---|
| 0 | Thermal_Gap | 0.510000 |
| 1 | fc_wind | -0.000000 |
| 2 | fc_solar | -0.000000 |
| 3 | BoT_FR | -0.000000 |
| 4 | holiday | -0.000000 |
| 5 | hour | 0.000000 |
| 6 | weekend | 0.000000 |
| 7 | thermal_gap_daily | -0.000000 |
| 9 | hard_lockdown | -0.080000 |
| 8 | covid_period | -0.250000 |
Lasso_df = pd.DataFrame({'Actual': Tss_std_y_test, 'Predicted':lasso_price_pred})
Lasso_df.head(5)
| Actual | Predicted | |
|---|---|---|
| 32064 | -1.215964 | -1.444468 |
| 32065 | -1.348777 | -1.483040 |
| 32066 | -1.380432 | -1.496228 |
| 32067 | -1.312993 | -1.416255 |
| 32068 | -0.970293 | -1.330334 |
#De-standardizing the predicted output
mean = y.iloc[test_index].mean()
std = y.iloc[test_index].std()
Lasso_df_destandardized = Lasso_df.apply(lambda row:((row*std)+mean),axis =1)
Lasso_df_destandardized.head(5)
| Actual | Predicted | |
|---|---|---|
| 32064 | 32.675136 | 31.602996 |
| 32065 | 32.051978 | 31.422021 |
| 32066 | 31.903453 | 31.360140 |
| 32067 | 32.219876 | 31.735371 |
| 32068 | 33.827818 | 32.138511 |
# RMSE(post de-standardising the variables):
print('Test RMSE: %.2f' % mean_squared_error(Tss_y_test, Lasso_df_destandardized.Predicted, squared=False))
Test RMSE: 9.70
results_df.loc[len(results_df)] = ['Lasso LR Standardized (TSS)', mean_squared_error(Tss_y_test, Lasso_df_destandardized.Predicted , squared=False)]
results_df
| Algorithm | RMSE | |
|---|---|---|
| 0 | LR Baseline | 17.767744 |
| 1 | LR Baseline (TSS) | 10.468930 |
| 2 | LR Baseline (BTSS) | 4.398897 |
| 3 | Ridge LR Standardized (TSS) | 9.294806 |
| 4 | Ridge LR (TSS) | 10.466368 |
| 5 | Ridge LR Standardized (BTSS) | 4.484383 |
| 6 | Ridge LR (BTSS) | 4.398964 |
| 7 | Lasso LR Standardized (TSS) | 9.696076 |
lasso_reg = Lasso(alpha = 0.1)
lasso_reg.fit(Tss_X_train, Tss_y_train)
lasso_price_pred = lasso_reg.predict(Tss_X_test)
# The coefficients
print('Coefficients: \n', lasso_reg.coef_)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % lasso_reg.score(Tss_X_train, Tss_y_train))
print('Test R-Squared: %.2f' % r2_score(Tss_y_test,lasso_price_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Tss_y_test, lasso_price_pred, squared=False))
Coefficients: [ 1.89123679e-03 4.06809204e-05 -3.04624201e-04 2.39945248e-04 0.00000000e+00 -5.16262999e-02 1.13010910e+00 -1.68830526e-04 -1.22051686e+01 -5.58177477e+00] Train R-Squared: 0.64 Test R-Squared: -1.18 Test RMSE: 10.55
results_df.loc[len(results_df)] = ['Lasso LR (TSS)', mean_squared_error(Tss_y_test,lasso_reg.predict(Tss_X_test) , squared=False)]
results_df
| Algorithm | RMSE | |
|---|---|---|
| 0 | LR Baseline | 17.767744 |
| 1 | LR Baseline (TSS) | 10.468930 |
| 2 | LR Baseline (BTSS) | 4.398897 |
| 3 | Ridge LR Standardized (TSS) | 9.294806 |
| 4 | Ridge LR (TSS) | 10.466368 |
| 5 | Ridge LR Standardized (BTSS) | 4.484383 |
| 6 | Ridge LR (BTSS) | 4.398964 |
| 7 | Lasso LR Standardized (TSS) | 9.696076 |
| 8 | Lasso LR (TSS) | 10.550067 |
get_feature_importance(lasso_reg, Tss_X_train.columns)
| variable | coefficient | |
|---|---|---|
| 6 | weekend | 1.130000 |
| 0 | Thermal_Gap | 0.000000 |
| 1 | fc_wind | 0.000000 |
| 2 | fc_solar | -0.000000 |
| 3 | BoT_FR | 0.000000 |
| 4 | holiday | 0.000000 |
| 7 | thermal_gap_daily | -0.000000 |
| 5 | hour | -0.050000 |
| 9 | hard_lockdown | -5.580000 |
| 8 | covid_period | -12.210000 |
Lasso_df = pd.DataFrame({'Actual': Tss_y_test, 'Predicted':lasso_price_pred})
Lasso_df.head(5)
| Actual | Predicted | |
|---|---|---|
| 32112 | 30.82 | 23.847397 |
| 32113 | 28.89 | 23.102069 |
| 32114 | 28.43 | 22.798516 |
| 32115 | 29.41 | 24.093410 |
| 32116 | 34.39 | 25.576727 |
lasso_reg = Lasso(alpha = 0.1)
lasso_reg.fit(Btss_std_X_train, Btss_std_y_train)
lasso_price_pred = lasso_reg.predict(Btss_std_X_test)
# The coefficients
print('Coefficients: \n', lasso_reg.coef_)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % lasso_reg.score(Btss_std_X_train, Btss_std_y_train))
print('Test R-Squared: %.2f' % r2_score(Btss_std_y_test,lasso_price_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Btss_std_y_test, lasso_price_pred, squared=False))
Coefficients: [ 0. 0. 0. -0. -0. 0.03804357 0. -0. -0. -0. ] Train R-Squared: 0.18 Test R-Squared: 0.10 Test RMSE: 0.30
#results_df.loc[len(results_df)] = ['Lasso LR Standardized (BTSS)', mean_squared_error(Btss_std_y_test,lasso_reg.predict(Btss_std_X_test) , squared=False)]
#results_df
get_feature_importance(lasso_reg, Btss_std_X_train.columns)
| variable | coefficient | |
|---|---|---|
| 5 | hour | 0.040000 |
| 0 | Thermal_Gap | 0.000000 |
| 1 | fc_wind | 0.000000 |
| 2 | fc_solar | 0.000000 |
| 3 | BoT_FR | -0.000000 |
| 4 | holiday | -0.000000 |
| 6 | weekend | 0.000000 |
| 7 | thermal_gap_daily | -0.000000 |
| 8 | covid_period | -0.000000 |
| 9 | hard_lockdown | -0.000000 |
Lasso_df = pd.DataFrame({'Actual': Btss_std_y_test, 'Predicted':lasso_price_pred})
Lasso_df.head(5)
| Actual | Predicted | |
|---|---|---|
| 31642 | -0.564283 | -0.603679 |
| 31643 | -0.451426 | -0.598183 |
| 31644 | -0.464501 | -0.592687 |
| 31645 | -0.584928 | -0.587191 |
| 31646 | -0.715677 | -0.581695 |
#De-standardizing the predicted output
mean = y.iloc[test_index].mean()
std = y.iloc[test_index].std()
Lasso_df_destandardized = Lasso_df.apply(lambda row:((row*std)+mean),axis =1)
Lasso_df_destandardized.head(5)
| Actual | Predicted | |
|---|---|---|
| 31642 | 35.732810 | 35.547966 |
| 31643 | 36.262333 | 35.573753 |
| 31644 | 36.200986 | 35.599540 |
| 31645 | 35.635946 | 35.625327 |
| 31646 | 35.022474 | 35.651114 |
# RMSE(post de-standardising the variables):
print('Test RMSE: %.2f' % mean_squared_error(Btss_y_test, Lasso_df_destandardized.Predicted, squared=False))
Test RMSE: 5.26
results_df.loc[len(results_df)] = ['Lasso LR Standardized (BTSS)', mean_squared_error(Btss_y_test, Lasso_df_destandardized.Predicted , squared=False)]
results_df
| Algorithm | RMSE | |
|---|---|---|
| 0 | LR Baseline | 17.767744 |
| 1 | LR Baseline (TSS) | 10.468930 |
| 2 | LR Baseline (BTSS) | 4.398897 |
| 3 | Ridge LR Standardized (TSS) | 9.294806 |
| 4 | Ridge LR (TSS) | 10.466368 |
| 5 | Ridge LR Standardized (BTSS) | 4.484383 |
| 6 | Ridge LR (BTSS) | 4.398964 |
| 7 | Lasso LR Standardized (TSS) | 9.696076 |
| 8 | Lasso LR (TSS) | 10.550067 |
| 9 | Lasso LR Standardized (BTSS) | 5.259953 |
lasso_reg = Lasso(alpha = 0.1)
lasso_reg.fit(Btss_X_train, Btss_y_train)
lasso_price_pred = lasso_reg.predict(Btss_X_test)
# The coefficients
print('Coefficients: \n', lasso_reg.coef_)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % lasso_reg.score(Btss_X_train, Btss_y_train))
print('Test R-Squared: %.2f' % r2_score(Btss_y_test,lasso_price_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Btss_y_test, lasso_price_pred, squared=False))
Coefficients: [ 1.49885501e-03 -8.92314367e-05 3.00681607e-04 0.00000000e+00 0.00000000e+00 1.28584487e-02 0.00000000e+00 -1.07162657e-03 0.00000000e+00 0.00000000e+00] Train R-Squared: 0.83 Test R-Squared: 0.08 Test RMSE: 4.42
results_df.loc[len(results_df)] = ['Lasso LR (BTSS)', mean_squared_error(Btss_y_test,lasso_reg.predict(Btss_X_test) , squared=False)]
results_df
| Algorithm | RMSE | |
|---|---|---|
| 0 | LR Baseline | 17.767744 |
| 1 | LR Baseline (TSS) | 10.468930 |
| 2 | LR Baseline (BTSS) | 4.398897 |
| 3 | Ridge LR Standardized (TSS) | 9.294806 |
| 4 | Ridge LR (TSS) | 10.466368 |
| 5 | Ridge LR Standardized (BTSS) | 4.484383 |
| 6 | Ridge LR (BTSS) | 4.398964 |
| 7 | Lasso LR Standardized (TSS) | 9.696076 |
| 8 | Lasso LR (TSS) | 10.550067 |
| 9 | Lasso LR Standardized (BTSS) | 5.259953 |
| 10 | Lasso LR (BTSS) | 4.415596 |
get_feature_importance(lasso_reg, Btss_X_train.columns)
| variable | coefficient | |
|---|---|---|
| 5 | hour | 0.010000 |
| 0 | Thermal_Gap | 0.000000 |
| 1 | fc_wind | -0.000000 |
| 2 | fc_solar | 0.000000 |
| 3 | BoT_FR | 0.000000 |
| 4 | holiday | 0.000000 |
| 6 | weekend | 0.000000 |
| 7 | thermal_gap_daily | -0.000000 |
| 8 | covid_period | 0.000000 |
| 9 | hard_lockdown | 0.000000 |
Lasso_df = pd.DataFrame({'Actual': Btss_y_test, 'Predicted':lasso_price_pred})
Lasso_df.head(5)
| Actual | Predicted | |
|---|---|---|
| 31690 | 40.29 | 42.600894 |
| 31691 | 41.93 | 43.407575 |
| 31692 | 41.74 | 44.169016 |
| 31693 | 39.99 | 43.113808 |
| 31694 | 38.09 | 40.524630 |
# Checking for stationarity in the price and lag
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20,10))
axes[0].plot(pm_df['price'])
axes[1].plot(pm_df['price_lag_daily'])
fig.tight_layout()
#pm_df = pm_df.iloc[24:]
from statsmodels.tsa.stattools import adfuller
series = pm_df.loc[:, 'price_lag_daily'].values
result = adfuller(series)
print('ADF Statistic: %f' % result[0])
print('p-value: %f' % result[1])
print('Critical Values:')
for key, value in result[4].items():
    print('\t%s: %.3f' % (key, value))
# Based on this test (p-value ~ 0 and a statistic far below every critical value), the series is indeed stationary
ADF Statistic: -28.604032 p-value: 0.000000 Critical Values:
#Sampling a small slice (the first 13 hourly observations) for a quick ARIMA illustration
from matplotlib import pyplot
sample_arima = pm_df.iloc[0:13]
series_arima = sample_arima.loc[:, 'price'].values
# Autocorrelation plot
from pandas.plotting import autocorrelation_plot
autocorrelation_plot(series_arima)
pyplot.show()
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
plot_pacf(series_arima, lags = 4)
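The PACF plot guides the choice of the AR order: if it shows one significant spike at lag 1 and dies out afterwards, a single AR term is justified, which is consistent with the ARIMA(1,1,0) specification fitted below.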
# Importing ARIMA functions
from pandas import DataFrame
from statsmodels.tsa.arima.model import ARIMA
from matplotlib import pyplot
# ARIMA 2
# fit model
model = ARIMA(series_arima, order=(1,1,0))
model_fit = model.fit()
# summary of fit model
print(model_fit.summary())
# line plot of residuals
residuals = DataFrame(model_fit.resid)
residuals.plot()
pyplot.show()
# density plot of residuals
residuals.plot(kind='kde')
pyplot.show()
# summary stats of residuals
print(residuals.describe())
SARIMAX Results
==============================================================================
Dep. Variable: y No. Observations: 13
Model: ARIMA(1, 1, 0) Log Likelihood -33.386
Date: Wed, 31 Mar 2021 AIC 70.771
Time: 03:45:51 BIC 71.741
Sample: 0 HQIC 70.412
- 13
Covariance Type: opg
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
ar.L1 0.5839 0.137 4.258 0.000 0.315 0.853
sigma2 14.7561 6.480 2.277 0.023 2.056 27.456
===================================================================================
Ljung-Box (L1) (Q): 0.33 Jarque-Bera (JB): 0.23
Prob(Q): 0.56 Prob(JB): 0.89
Heteroskedasticity (H): 0.03 Skew: 0.28
Prob(H) (two-sided): 0.01 Kurtosis: 2.62
===================================================================================
Warnings:
[1] Covariance matrix calculated using the outer product of gradients (complex-step).
0 count 13.000000 mean 4.756283 std 15.476249 min -6.408647 25% -1.940715 50% 0.009414 75% 3.964902 max 54.680000
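Note that the residual summary is dominated by the very first residual (54.68, essentially the first undifferenced observation), which drags the mean up to ~4.76; with only 13 observations these diagnostics are illustrative at best.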
# split into train and test sets (kept commented out: refitting ARIMA at every step over the full series is very slow)
#w = pm_df.loc[:, 'price'].values
#size = int(len(w) * 0.66)
#train, test = w[0:size], w[size:len(w)]
#history = [obs for obs in train]
#arima_predictions = list()
# walk-forward validation
#for t in range(len(test)):
#    model = ARIMA(history, order=(1,1,0)) # adjust the order if a different specification is chosen
#    model_fit = model.fit()
#    output = model_fit.forecast()
#    yhat = output[0]
#    arima_predictions.append(yhat)
#    obs = test[t]
#    history.append(obs)
#    print('predicted=%f, expected=%f' % (yhat, obs))
# evaluate forecasts
#rmse = sqrt(mean_squared_error(test, arima_predictions))
#print('Test RMSE: %.3f' % rmse)
# plot forecasts against actual outcomes
#pyplot.plot(test)
#pyplot.plot(arima_predictions, color='red')
#pyplot.show()
#results_df.loc[len(results_df)] = ['ARIMA(1,1,0)', mean_squared_error(test, arima_predictions, squared=False)]
#results_df
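As a lighter alternative, the same walk-forward scheme can be run on a short recent slice of the series so it finishes quickly. A sketch under that assumption (the 200-hour window is an arbitrary choice of ours, and the resulting RMSE is illustrative only, not comparable to the results table):
from math import sqrt
w = pm_df.loc[:, 'price'].values[-200:] # last 200 hours only, to keep the refitting loop cheap
size = int(len(w) * 0.66)
train, test = list(w[0:size]), w[size:]
arima_predictions = list()
for t in range(len(test)):
    model_fit = ARIMA(train, order=(1,1,0)).fit() # refit on all data seen so far
    arima_predictions.append(model_fit.forecast()[0]) # one-step-ahead forecast
    train.append(test[t]) # roll forward with the observed value
print('Walk-forward RMSE: %.3f' % sqrt(mean_squared_error(test, arima_predictions)))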
from sklearn.ensemble import RandomForestRegressor
forest_reg = RandomForestRegressor()
forest_reg.fit(Tss_X_train, Tss_y_train)
forest_price_pred = forest_reg.predict(Tss_X_test)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % forest_reg.score(Tss_X_train, Tss_y_train))
print('Test R-Squared: %.2f' % r2_score(Tss_y_test,forest_price_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Tss_y_test, forest_price_pred, squared=False))
Train R-Squared: 0.97 Test R-Squared: -0.32 Test RMSE: 8.21
results_df.loc[len(results_df)] = ['Random Forest(TSS)', mean_squared_error(Tss_y_test,forest_reg.predict(Tss_X_test) , squared=False)]
results_df
| Algorithm | RMSE | |
|---|---|---|
| 0 | LR Baseline | 17.767744 |
| 1 | LR Baseline (TSS) | 10.468930 |
| 2 | LR Baseline (BTSS) | 4.398897 |
| 3 | Ridge LR Standardized (TSS) | 9.294806 |
| 4 | Ridge LR (TSS) | 10.466368 |
| 5 | Ridge LR Standardized (BTSS) | 4.484383 |
| 6 | Ridge LR (BTSS) | 4.398964 |
| 7 | Lasso LR Standardized (TSS) | 9.696076 |
| 8 | Lasso LR (TSS) | 10.550067 |
| 9 | Lasso LR Standardized (BTSS) | 5.259953 |
| 10 | Lasso LR (BTSS) | 4.415596 |
| 11 | Random Forest(TSS) | 8.211579 |
forest_df = pd.DataFrame({'Actual': Tss_y_test, 'Predicted':forest_price_pred})
forest_df.head(5)
| Actual | Predicted | |
|---|---|---|
| 32112 | 30.82 | 29.5762 |
| 32113 | 28.89 | 29.1113 |
| 32114 | 28.43 | 28.9801 |
| 32115 | 29.41 | 28.2486 |
| 32116 | 34.39 | 29.2328 |
forest_reg = RandomForestRegressor()
forest_reg.fit(Btss_X_train, Btss_y_train)
forest_price_pred = forest_reg.predict(Btss_X_test)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % forest_reg.score(Btss_X_train, Btss_y_train))
print('Test R-Squared: %.2f' % r2_score(Btss_y_test,forest_price_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Btss_y_test, forest_price_pred, squared=False))
Train R-Squared: 0.98 Test R-Squared: 0.13 Test RMSE: 4.30
results_df.loc[len(results_df)] = ['Random Forest(BTSS)', mean_squared_error(Btss_y_test,forest_reg.predict(Btss_X_test) , squared=False)]
results_df
| Algorithm | RMSE | |
|---|---|---|
| 0 | LR Baseline | 17.767744 |
| 1 | LR Baseline (TSS) | 10.468930 |
| 2 | LR Baseline (BTSS) | 4.398897 |
| 3 | Ridge LR Standardized (TSS) | 9.294806 |
| 4 | Ridge LR (TSS) | 10.466368 |
| 5 | Ridge LR Standardized (BTSS) | 4.484383 |
| 6 | Ridge LR (BTSS) | 4.398964 |
| 7 | Lasso LR Standardized (TSS) | 9.696076 |
| 8 | Lasso LR (TSS) | 10.550067 |
| 9 | Lasso LR Standardized (BTSS) | 5.259953 |
| 10 | Lasso LR (BTSS) | 4.415596 |
| 11 | Random Forest(TSS) | 8.211579 |
| 12 | Random Forest(BTSS) | 4.295087 |
forest_df = pd.DataFrame({'Actual': Btss_y_test, 'Predicted': forest_price_pred})
forest_df.head(5)
| Actual | Predicted | |
|---|---|---|
| 31690 | 40.29 | 41.5146 |
| 31691 | 41.93 | 42.5979 |
| 31692 | 41.74 | 42.7264 |
| 31693 | 39.99 | 42.5400 |
| 31694 | 38.09 | 41.4077 |
scores = cross_val_score(forest_reg, Btss_X_train, Btss_y_train, cv=47, scoring='neg_root_mean_squared_error')
forest_rmse_scores = (-scores)
display_scores(forest_rmse_scores)
Scores: [-0.5168 -0.5874 -0.9456 -0.2753 -0.7825 -0.357 -1.5868 -0.6382 -1.1516 -0.9692 -0.7579 -1.4341 -1.1924 -2.2931 -3.223 -1.3294 -1.8905 -2.8649 -2.0512 -2.0427 -0.0215 -0.7814 -1.0597 -0.5828 -0.3096 -0.0934 -0.6145 -0.3725 -0.3816 -0.0509 -0.2319 -1.4734 -0.6744 -1.5318 -0.4186 -1.084 -1.1275 -0.1944 -0.957 -1.0416 -0.7339 -3.5929 -1.313 -0.2118 -0.2801 -0.4908 -0.2148] Mean -0.9942425531914908 Standard Deviation: 0.8093972216684076
from sklearn.model_selection import GridSearchCV
param_grid = [{'n_estimators': [12,24,365] , 'max_features': [2,3,4,6] },
{'bootstrap': [False], 'n_estimators': [12,24,365] , 'max_features': [2,3,4]}]
grid_search = GridSearchCV(forest_reg, param_grid , cv = 5 , scoring = 'neg_root_mean_squared_error', return_train_score = True)
grid_search.fit(Btss_X_train, Btss_y_train)
grid_search.best_params_
{'max_features': 6, 'n_estimators': 12}
grid_search.best_estimator_
RandomForestRegressor(max_features=6, n_estimators=12)
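Besides best_params_, GridSearchCV also exposes the cross-validated score of the winning combination and the full search table; a quick way to inspect them (a sketch, using only standard GridSearchCV attributes):
print('Best CV RMSE: %.2f' % -grid_search.best_score_) # scoring was negated RMSE, so flip the sign
cv_results = pd.DataFrame(grid_search.cv_results_)
cv_results[['params', 'mean_test_score', 'std_test_score']].sort_values('mean_test_score', ascending=False).head()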
forest_reg = RandomForestRegressor(max_features=6, n_estimators=365)
forest_reg.fit(Btss_X_train, Btss_y_train)
forest_price_pred = forest_reg.predict(Btss_X_test)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % forest_reg.score(Btss_X_train, Btss_y_train))
print('Test R-Squared: %.2f' % r2_score(Btss_y_test,forest_price_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Btss_y_test, forest_price_pred, squared=False))
Train R-Squared: 0.98 Test R-Squared: 0.21 Test RMSE: 4.10
# Random Forest (BTSS) variable importance
feature_importances = pd.DataFrame(forest_reg.feature_importances_, index = Btss_X_train.columns,columns=['importance']).sort_values('importance', ascending=False)
feature_importances
plt.figure(figsize=(10,8))
#Plot Seaborn bar chart
sns.barplot(x=list(feature_importances['importance']), y=list(feature_importances.index)) # align labels with the sorted importances
#Add chart labels
plt.title('RANDOM FOREST FEATURE IMPORTANCE')
plt.xlabel('FEATURE IMPORTANCE')
plt.ylabel('FEATURE NAMES')
Text(0, 0.5, 'FEATURE NAMES')
#Visualizing one tree of the random forest BTSS model
plt.figure(figsize=(30,30))
_ = tree.plot_tree(forest_reg.estimators_[0], feature_names=Btss_X_train.columns, filled=True)
#from sklearn.model_selection import GridSearchCV
#param_grid = [{'n_estimators': [12,24,365], 'max_features': [2,3,4,6]},
#              {'bootstrap': [False], 'n_estimators': [12,24,365], 'max_features': [2,3,4]}]
#grid_search = GridSearchCV(forest_reg, param_grid, cv = btss, scoring = 'neg_mean_squared_error', return_train_score = True)
#grid_search.fit(Btss_X_train, Btss_y_train)
#grid_search.best_params_
Note: Requires data to be standardized.
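The standardized splits used below (Tss_std_* and Btss_std_*) were built earlier in the notebook; for reference, a minimal sketch of how such splits can be produced with scikit-learn's StandardScaler, fitting on the training fold only, which is the usual way to avoid leaking test-set statistics (the de-standardization cells elsewhere use the test fold's own mean and std):
from sklearn.preprocessing import StandardScaler
x_scaler = StandardScaler().fit(Tss_X_train) # fit on the training fold only
Tss_std_X_train_sketch = pd.DataFrame(x_scaler.transform(Tss_X_train), columns=Tss_X_train.columns)
Tss_std_X_test_sketch = pd.DataFrame(x_scaler.transform(Tss_X_test), columns=Tss_X_test.columns)
# the target y would be standardized the same way with its own scaler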
from sklearn.svm import LinearSVR
svm_reg = LinearSVR(epsilon = 1.5)
svm_reg.fit(Tss_std_X_train, Tss_std_y_train)
svm_price_pred = svm_reg.predict(Tss_std_X_test)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % svm_reg.score(Tss_std_X_train, Tss_std_y_train))
print('Test R-Squared: %.2f' % r2_score(Tss_std_y_test,svm_price_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Tss_std_y_test, svm_price_pred, squared=False))
Train R-Squared: 0.60 Test R-Squared: 0.71 Test RMSE: 0.26
#results_df.loc[len(results_df)] = ['SVR Standardized', mean_squared_error(Tss_std_y_test,svm_reg.predict(Tss_std_X_test) , squared=False)]
#results_df
svm_df = pd.DataFrame({'Actual': Tss_std_y_test, 'Predicted': svm_price_pred})
svm_df.head(5)
| Actual | Predicted | |
|---|---|---|
| 32064 | -1.215964 | -1.411400 |
| 32065 | -1.348777 | -1.472814 |
| 32066 | -1.380432 | -1.495046 |
| 32067 | -1.312993 | -1.402626 |
| 32068 | -0.970293 | -1.293604 |
#De-standardizing the predicted output
mean = y.iloc[test_index].mean()
std = y.iloc[test_index].std()
svr_df_destandardized = svm_df.apply(lambda row:((row*std)+mean),axis =1)
svr_df_destandardized.head(5)
| Actual | Predicted | |
|---|---|---|
| 32064 | 32.675136 | 31.758154 |
| 32065 | 32.051978 | 31.470001 |
| 32066 | 31.903453 | 31.365689 |
| 32067 | 32.219876 | 31.799318 |
| 32068 | 33.827818 | 32.310847 |
# RMSE(post de-standardising the variables):
#print('Test RMSE: %.2f' % mean_squared_error(Tss_y_test, svr_df_destandardized.Predicted, squared=False))
#results_df.loc[len(results_df)] = ['SVR Standardized', mean_squared_error(Tss_y_test, svr_df_destandardized.Predicted, squared=False)]
#results_df
scores = cross_val_score(svm_reg, Tss_std_X_train, Tss_std_y_train, cv=47, scoring='neg_root_mean_squared_error')
svm_rmse_scores =(-scores)
display_scores(svm_rmse_scores)
Scores: [-1.23321542 -0.42645488 -0.52743087 -0.30799735 -0.42344421 -0.62773267 -0.69443483 -0.65086723 -0.52432152 -0.40201459 -0.42028138 -0.57037387 -0.63418371 -0.39894882 -0.31107176 -0.7599727 -0.66905461 -0.36134085 -0.26678043 -0.35127632 -0.64828022 -0.92592038 -1.30295907 -0.86740712 -0.56674768 -0.66885121 -0.58711961 -0.36310809 -0.30662731 -0.44889917 -0.26306268 -0.48484779 -0.5350697 -0.6809525 -0.69153978 -0.49907671 -0.49481398 -0.75608808 -1.33783268 -0.98248989 -0.99169851 -0.29120639 -0.90492997 -0.42522231 -0.48245251 -0.68579237 -0.55141695] Mean -0.602247036333474 Standard Deviation: 0.2589481176168847
param_grid = [{'epsilon': [0,1.0,2.0] , 'C': [0.75,1,1.25]}]
grid_search = GridSearchCV(svm_reg, param_grid , cv = 10 , scoring = 'neg_root_mean_squared_error', return_train_score = True)
grid_search.fit(Tss_X_train, Tss_y_train) # note: this search runs on the unstandardized TSS split, while the refit below uses the standardized one
grid_search.best_params_
{'C': 0.75, 'epsilon': 2.0}
grid_search.best_estimator_
LinearSVR(C=0.75, epsilon=2.0)
svm_reg = LinearSVR(C=0.75, epsilon=0) # keeping C from the grid search but overriding epsilon (the grid optimum was 2.0)
svm_reg.fit(Tss_std_X_train, Tss_std_y_train)
svm_price_pred = svm_reg.predict(Tss_std_X_test)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % svm_reg.score(Tss_std_X_train, Tss_std_y_train))
print('Test R-Squared: %.2f' % r2_score(Tss_std_y_test,svm_price_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Tss_std_y_test, svm_price_pred, squared=False))
Train R-Squared: 0.64 Test R-Squared: -0.84 Test RMSE: 0.67
#De-standardizing the predicted output
mean = y.iloc[test_index].mean()
std = y.iloc[test_index].std()
svr_df_destandardized = svm_df.apply(lambda row:((row*std)+mean),axis =1)
svr_df_destandardized.head(5)
| Actual | Predicted | |
|---|---|---|
| 32064 | 32.675136 | 31.758154 |
| 32065 | 32.051978 | 31.470001 |
| 32066 | 31.903453 | 31.365689 |
| 32067 | 32.219876 | 31.799318 |
| 32068 | 33.827818 | 32.310847 |
# RMSE(post de-standardising the variables):
print('Test RMSE: %.2f' % mean_squared_error(Tss_y_test, svr_df_destandardized.Predicted, squared=False))
Test RMSE: 8.59
results_df.loc[len(results_df)] = ['SVR Standardized', mean_squared_error(Tss_y_test, svr_df_destandardized.Predicted, squared=False)]
results_df
| Algorithm | RMSE | |
|---|---|---|
| 0 | LR Baseline | 17.767744 |
| 1 | LR Baseline (TSS) | 10.468930 |
| 2 | LR Baseline (BTSS) | 4.398897 |
| 3 | Ridge LR Standardized (TSS) | 9.294806 |
| 4 | Ridge LR (TSS) | 10.466368 |
| 5 | Ridge LR Standardized (BTSS) | 4.484383 |
| 6 | Ridge LR (BTSS) | 4.398964 |
| 7 | Lasso LR Standardized (TSS) | 9.696076 |
| 8 | Lasso LR (TSS) | 10.550067 |
| 9 | Lasso LR Standardized (BTSS) | 5.259953 |
| 10 | Lasso LR (BTSS) | 4.415596 |
| 11 | Random Forest(TSS) | 8.211579 |
| 12 | Random Forest(BTSS) | 4.295087 |
| 13 | SVR Standardized | 8.591501 |
svm_reg = LinearSVR(epsilon = 1.5)
svm_reg.fit(Btss_std_X_train, Btss_std_y_train)
svm_price_pred = svm_reg.predict(Btss_std_X_test)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % svm_reg.score(Btss_std_X_train, Btss_std_y_train))
print('Test R-Squared: %.2f' % r2_score(Btss_std_y_test,svm_price_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Btss_std_y_test, svm_price_pred, squared=False))
Train R-Squared: -7.09 Test R-Squared: -4.84 Test RMSE: 0.76
#results_df.loc[len(results_df)] = ['SVR Standardized (BTSS)', mean_squared_error(Btss_std_y_test,svm_reg.predict(Btss_std_X_test) , squared=False)]
#results_df
svm_df = pd.DataFrame({'Actual': Btss_std_y_test, 'Predicted': svm_price_pred})
svm_df.head(5)
| Actual | Predicted | |
|---|---|---|
| 31642 | -0.564283 | 0.0 |
| 31643 | -0.451426 | 0.0 |
| 31644 | -0.464501 | 0.0 |
| 31645 | -0.584928 | 0.0 |
| 31646 | -0.715677 | 0.0 |
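One plausible reading of the all-zero predictions above: on a standardized target (std = 1), epsilon = 1.5 makes the epsilon-insensitive tube wider than most of the target's spread, so a near-constant prediction already incurs almost no epsilon loss and the solver barely moves away from it, hence the negative R² on both train and test.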
# RMSE against the raw (unstandardized) test prices; the predictions above are still on the standardized scale, which inflates this figure:
print('Test RMSE: %.2f' % mean_squared_error(Btss_y_test, svm_price_pred, squared=False))
Test RMSE: 38.65
results_df.loc[len(results_df)] = ['SVR (BTSS)', mean_squared_error(Btss_y_test, svm_price_pred, squared=False)]
results_df
| Algorithm | RMSE | |
|---|---|---|
| 0 | LR Baseline | 17.767744 |
| 1 | LR Baseline (TSS) | 10.468930 |
| 2 | LR Baseline (BTSS) | 4.398897 |
| 3 | Ridge LR Standardized (TSS) | 9.294806 |
| 4 | Ridge LR (TSS) | 10.466368 |
| 5 | Ridge LR Standardized (BTSS) | 4.484383 |
| 6 | Ridge LR (BTSS) | 4.398964 |
| 7 | Lasso LR Standardized (TSS) | 9.696076 |
| 8 | Lasso LR (TSS) | 10.550067 |
| 9 | Lasso LR Standardized (BTSS) | 5.259953 |
| 10 | Lasso LR (BTSS) | 4.415596 |
| 11 | Random Forest(TSS) | 8.211579 |
| 12 | Random Forest(BTSS) | 4.295087 |
| 13 | SVR Standardized | 8.591501 |
| 14 | SVR (BTSS) | 38.654286 |
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor(criterion='mse') #max_depth can be used to control size of the tree.
tree_reg.fit(Tss_X_train, Tss_y_train)
tree_price_pred = tree_reg.predict(Tss_X_test)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % tree_reg.score(Tss_X_train, Tss_y_train))
print('Test R-Squared: %.2f' % r2_score(Tss_y_test,tree_price_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Tss_y_test, tree_price_pred, squared=False))
Train R-Squared: 1.00 Test R-Squared: -0.61 Test RMSE: 9.08
results_df.loc[len(results_df)] = [' Decision Tree (TSS)', mean_squared_error(Tss_y_test,tree_reg.predict(Tss_X_test) , squared=False)]
results_df
| Algorithm | RMSE | |
|---|---|---|
| 0 | LR Baseline | 17.767744 |
| 1 | LR Baseline (TSS) | 10.468930 |
| 2 | LR Baseline (BTSS) | 4.398897 |
| 3 | Ridge LR Standardized (TSS) | 9.294806 |
| 4 | Ridge LR (TSS) | 10.466368 |
| 5 | Ridge LR Standardized (BTSS) | 4.484383 |
| 6 | Ridge LR (BTSS) | 4.398964 |
| 7 | Lasso LR Standardized (TSS) | 9.696076 |
| 8 | Lasso LR (TSS) | 10.550067 |
| 9 | Lasso LR Standardized (BTSS) | 5.259953 |
| 10 | Lasso LR (BTSS) | 4.415596 |
| 11 | Random Forest(TSS) | 8.211579 |
| 12 | Random Forest(BTSS) | 4.295087 |
| 13 | SVR Standardized | 8.591501 |
| 14 | SVR (BTSS) | 38.654286 |
| 15 | Decision Tree (TSS) | 9.080890 |
tree_df = pd.DataFrame({'Actual': Tss_y_test, 'Predicted': tree_price_pred})
tree_df.head(5)
| Actual | Predicted | |
|---|---|---|
| 32112 | 30.82 | 25.50 |
| 32113 | 28.89 | 25.50 |
| 32114 | 28.43 | 25.50 |
| 32115 | 29.41 | 25.50 |
| 32116 | 34.39 | 27.44 |
from sklearn.tree import DecisionTreeRegressor
tree_reg = DecisionTreeRegressor(criterion='mse')
tree_reg.fit(Btss_X_train, Btss_y_train)
tree_price_pred = tree_reg.predict(Btss_X_test)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % tree_reg.score(Btss_X_train, Btss_y_train))
print('Test R-Squared: %.2f' % r2_score(Btss_y_test,tree_price_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Btss_y_test, tree_price_pred, squared=False))
Train R-Squared: 1.00 Test R-Squared: -0.07 Test RMSE: 4.74
results_df.loc[len(results_df)] = [' Decision Tree (BTSS)', mean_squared_error(Btss_y_test,tree_reg.predict(Btss_X_test) , squared=False)]
results_df
| Algorithm | RMSE | |
|---|---|---|
| 0 | LR Baseline | 17.767744 |
| 1 | LR Baseline (TSS) | 10.468930 |
| 2 | LR Baseline (BTSS) | 4.398897 |
| 3 | Ridge LR Standardized (TSS) | 9.294806 |
| 4 | Ridge LR (TSS) | 10.466368 |
| 5 | Ridge LR Standardized (BTSS) | 4.484383 |
| 6 | Ridge LR (BTSS) | 4.398964 |
| 7 | Lasso LR Standardized (TSS) | 9.696076 |
| 8 | Lasso LR (TSS) | 10.550067 |
| 9 | Lasso LR Standardized (BTSS) | 5.259953 |
| 10 | Lasso LR (BTSS) | 4.415596 |
| 11 | Random Forest(TSS) | 8.211579 |
| 12 | Random Forest(BTSS) | 4.295087 |
| 13 | SVR Standardized | 8.591501 |
| 14 | SVR (BTSS) | 38.654286 |
| 15 | Decision Tree (TSS) | 9.080890 |
| 16 | Decision Tree (BTSS) | 4.743625 |
tree_df = pd.DataFrame({'Actual': Btss_y_test, 'Predicted': tree_price_pred})
tree_df.head(5)
| Actual | Predicted | |
|---|---|---|
| 31690 | 40.29 | 39.98 |
| 31691 | 41.93 | 45.07 |
| 31692 | 41.74 | 45.07 |
| 31693 | 39.99 | 45.07 |
| 31694 | 38.09 | 39.98 |
scores = cross_val_score(tree_reg, Btss_X_train, Btss_y_train, cv=47, scoring='neg_root_mean_squared_error')
tree_rmse_scores = (-scores)
display_scores(tree_rmse_scores)
Scores: [-0.4 -0.92 -0.53 -0.01 -0.01 -0.54 -1.46 -0.65 -1.49 -0.86 -2.65 -3.27 -2.25 -3.94 -3.94 -2.83 -2.27 -3.68 -3.68 -2.18 -0.2 -1.14 -0.27 -1.15 -0.19 -0.01 -0.74 -0.01 -0.5 -0.5 -1.46 -0.1 -0.86 -1.05 -1.05 -0.84 -2.2 -1.42 -0.99 -0.99 -0.19 -3.14 -0.05 -1.84 -0.29 -0.86 -0.19] Mean -1.272127659574468 Standard Deviation: 1.1524939399299994
param_grid = [{'max_depth': [1,2,4,12,24] , 'max_features': [2,3,4,6] },]
grid_search = GridSearchCV(tree_reg, param_grid , cv = 10 , scoring = 'neg_root_mean_squared_error', return_train_score = True)
grid_search.fit(Btss_X_train, Btss_y_train)
grid_search.best_params_
{'max_depth': 12, 'max_features': 3}
grid_search.best_estimator_
DecisionTreeRegressor(max_depth=12, max_features=3)
tree_reg = DecisionTreeRegressor(max_depth=24, max_features=4, criterion='mse') # note: deviates from the grid-search optimum (max_depth=12, max_features=3)
tree_reg.fit(Btss_X_train, Btss_y_train)
tree_price_pred = tree_reg.predict(Btss_X_test)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % tree_reg.score(Btss_X_train, Btss_y_train))
print('Test R-Squared: %.2f' % r2_score(Btss_y_test,tree_price_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Btss_y_test, tree_price_pred, squared=False))
Train R-Squared: 1.00 Test R-Squared: 0.57 Test RMSE: 3.00
#results_df.loc[len(results_df)] = [' Decision Tree (BTSS)', mean_squared_error(Btss_y_test,tree_reg.predict(Btss_X_test) , squared=False)]
#results_df
from sklearn.ensemble import GradientBoostingRegressor
boosted_tree_reg = GradientBoostingRegressor()
boosted_tree_reg.fit(Tss_X_train, Tss_y_train)
boosted_tree_price_pred = boosted_tree_reg.predict(Tss_X_test)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % boosted_tree_reg.score(Tss_X_train, Tss_y_train))
print('Test R-Squared: %.2f' % r2_score(Tss_y_test,boosted_tree_price_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Tss_y_test, boosted_tree_price_pred, squared=False))
Train R-Squared: 0.72 Test R-Squared: -1.04 Test RMSE: 10.20
results_df.loc[len(results_df)] = [' Gradient Boosted Tree (TSS)', mean_squared_error(Tss_y_test,boosted_tree_reg.predict(Tss_X_test) , squared=False)]
results_df
| Algorithm | RMSE | |
|---|---|---|
| 0 | LR Baseline | 17.767744 |
| 1 | LR Baseline (TSS) | 10.468930 |
| 2 | LR Baseline (BTSS) | 4.398897 |
| 3 | Ridge LR Standardized (TSS) | 9.294806 |
| 4 | Ridge LR (TSS) | 10.466368 |
| 5 | Ridge LR Standardized (BTSS) | 4.484383 |
| 6 | Ridge LR (BTSS) | 4.398964 |
| 7 | Lasso LR Standardized (TSS) | 9.696076 |
| 8 | Lasso LR (TSS) | 10.550067 |
| 9 | Lasso LR Standardized (BTSS) | 5.259953 |
| 10 | Lasso LR (BTSS) | 4.415596 |
| 11 | Random Forest(TSS) | 8.211579 |
| 12 | Random Forest(BTSS) | 4.295087 |
| 13 | SVR Standardized | 8.591501 |
| 14 | SVR (BTSS) | 38.654286 |
| 15 | Decision Tree (TSS) | 9.080890 |
| 16 | Decision Tree (BTSS) | 4.743625 |
| 17 | Gradient Boosted Tree (TSS) | 10.203069 |
boosted_tree_df = pd.DataFrame({'Actual': Tss_y_test, 'Predicted': boosted_tree_price_pred})
boosted_tree_df.head(5)
| Actual | Predicted | |
|---|---|---|
| 32112 | 30.82 | 28.173279 |
| 32113 | 28.89 | 27.339215 |
| 32114 | 28.43 | 27.526148 |
| 32115 | 29.41 | 27.366025 |
| 32116 | 34.39 | 29.015168 |
boosted_tree_reg = GradientBoostingRegressor()
boosted_tree_reg.fit(Btss_X_train, Btss_y_train)
boosted_tree_price_pred = boosted_tree_reg.predict(Btss_X_test)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % boosted_tree_reg.score(Btss_X_train, Btss_y_train))
print('Test R-Squared: %.2f' % r2_score(Btss_y_test,boosted_tree_price_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Btss_y_test, boosted_tree_price_pred, squared=False))
Train R-Squared: 1.00 Test R-Squared: 0.06 Test RMSE: 4.46
#results_df.loc[len(results_df)] = [' Gradient Boosted Trees (BTSS)', mean_squared_error(Btss_y_test,boosted_tree_reg.predict(Btss_X_test) , squared=False)]
#results_df
boosted_tree_df = pd.DataFrame({'Actual': Btss_y_test, 'Predicted': boosted_tree_price_pred})
boosted_tree_df
| Actual | Predicted | |
|---|---|---|
| 31690 | 40.29 | 41.749988 |
| 31691 | 41.93 | 42.534082 |
| 31692 | 41.74 | 42.831699 |
| 31693 | 39.99 | 42.175186 |
| 31694 | 38.09 | 41.356877 |
| 31695 | 39.00 | 41.356877 |
| 31696 | 41.92 | 41.271297 |
| 31697 | 42.44 | 41.478411 |
| 31698 | 42.79 | 42.167440 |
| 31699 | 43.97 | 43.812386 |
| 31700 | 44.29 | 43.801589 |
| 31701 | 42.44 | 43.603109 |
| 31702 | 41.47 | 42.229440 |
| 31703 | 33.41 | 41.560017 |
| 31704 | 32.24 | 39.581689 |
| 31705 | 31.07 | 39.065647 |
| 31706 | 30.18 | 38.981453 |
| 31707 | 30.07 | 39.051601 |
| 31708 | 32.10 | 38.436468 |
| 31709 | 34.01 | 39.274714 |
| 31710 | 35.74 | 41.117826 |
| 31711 | 39.53 | 42.216625 |
| 31712 | 41.00 | 42.333534 |
| 31713 | 41.42 | 41.801935 |
scores = cross_val_score(boosted_tree_reg, Btss_X_train, Btss_y_train, cv=47, scoring='neg_root_mean_squared_error')
boosted_tree_rmse_scores = (-scores)
display_scores(boosted_tree_rmse_scores)
Scores: [-0.27719948 -1.07765356 -1.31579965 -0.15415254 -0.65147987 -0.89016863 -1.29621958 -0.03341155 -0.38895792 -0.65944598 -0.74014865 -2.13429174 -2.05282176 -3.44471068 -4.028178 -0.28902587 -1.10040751 -3.79079977 -2.88085682 -1.97874247 -0.16330916 -0.67756399 -0.74024787 -0.54308794 -0.17199836 -0.04598239 -0.66982605 -0.07801678 -0.10957865 -0.48794123 -0.85441191 -1.18329846 -0.07698125 -1.19217419 -0.40054507 -0.42744386 -0.36201256 -0.97508113 -1.13869955 -1.41947865 -0.63561527 -4.20532523 -0.74062501 -0.16156945 -0.20612816 -0.2718692 -0.30362576] Mean -1.009083174034604 Standard Deviation: 1.0633120048116216
param_grid = [{'n_estimators': [100,200,300,1000], 'max_depth': [1,2,4,12,24], 'max_features': [2,3,4,6]}]
grid_search = GridSearchCV(boosted_tree_reg, param_grid , cv = 10 , scoring = 'neg_root_mean_squared_error', return_train_score = True)
grid_search.fit(Btss_X_train, Btss_y_train)
grid_search.best_params_
{'max_depth': 1, 'max_features': 6, 'n_estimators': 1000}
grid_search.best_estimator_
GradientBoostingRegressor(max_depth=1, max_features=6, n_estimators=1000)
boosted_tree_reg = GradientBoostingRegressor(max_depth=12, max_features=6, n_estimators=1000) # note: max_depth deviates from the grid-search optimum of 1
boosted_tree_reg.fit(Btss_X_train, Btss_y_train)
boosted_tree_price_pred = boosted_tree_reg.predict(Btss_X_test)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % boosted_tree_reg.score(Btss_X_train, Btss_y_train))
print('Test R-Squared: %.2f' % r2_score(Btss_y_test,boosted_tree_price_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Btss_y_test, boosted_tree_price_pred, squared=False))
Train R-Squared: 1.00 Test R-Squared: 0.21 Test RMSE: 4.09
# Sort the features by importance
feature_importances = pd.DataFrame(boosted_tree_reg.feature_importances_, index = Btss_X_train.columns,columns=['importance']).sort_values('importance', ascending=False)
feature_importances
plt.figure(figsize=(10,8))
#Plot Seaborn bar chart
sns.barplot(x=list(feature_importances['importance']), y=list(feature_importances.index)) # align labels with the sorted importances
#Add chart labels
plt.title('GRADIENT BOOSTED TREE FEATURE IMPORTANCE')
plt.xlabel('FEATURE IMPORTANCE')
plt.ylabel('FEATURE NAMES')
Text(0, 0.5, 'FEATURE NAMES')
# Imports for plotting xgboost trees (used further below)
from xgboost import plot_tree
import matplotlib.pyplot as plt
#Finally, we'll visualize the original and predicted values in a plot.
x_ax = range(len(Btss_y_test))
plt.scatter(x_ax, Btss_y_test, s=5, color="blue", label="original")
plt.plot(x_ax, boosted_tree_price_pred, lw=0.8, color="red", label="predicted")
plt.legend()
plt.show()
results_df.loc[len(results_df)] = [' Gradient Boosted Trees (BTSS)', mean_squared_error(Btss_y_test,boosted_tree_reg.predict(Btss_X_test) , squared=False)]
results_df
| Algorithm | RMSE | |
|---|---|---|
| 0 | LR Baseline | 17.767744 |
| 1 | LR Baseline (TSS) | 10.468930 |
| 2 | LR Baseline (BTSS) | 4.398897 |
| 3 | Ridge LR Standardized (TSS) | 9.294806 |
| 4 | Ridge LR (TSS) | 10.466368 |
| 5 | Ridge LR Standardized (BTSS) | 4.484383 |
| 6 | Ridge LR (BTSS) | 4.398964 |
| 7 | Lasso LR Standardized (TSS) | 9.696076 |
| 8 | Lasso LR (TSS) | 10.550067 |
| 9 | Lasso LR Standardized (BTSS) | 5.259953 |
| 10 | Lasso LR (BTSS) | 4.415596 |
| 11 | Random Forest(TSS) | 8.211579 |
| 12 | Random Forest(BTSS) | 4.295087 |
| 13 | SVR Standardized | 8.591501 |
| 14 | SVR (BTSS) | 38.654286 |
| 15 | Decision Tree (TSS) | 9.080890 |
| 16 | Decision Tree (BTSS) | 4.743625 |
| 17 | Gradient Boosted Tree (TSS) | 10.203069 |
| 18 | Gradient Boosted Trees (BTSS) | 4.089170 |
import xgboost as xgb
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(Tss_X_train, Tss_y_train, eval_set = [(Tss_X_test,Tss_y_test)], early_stopping_rounds = 2)
xgb_price_pred = xgb_reg.predict(Tss_X_test)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % xgb_reg.score(Tss_X_train, Tss_y_train))
print('Test R-Squared: %.2f' % r2_score(Tss_y_test,xgb_price_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Tss_y_test, xgb_price_pred, squared=False))
[0] validation_0-rmse:32.39589 [1] validation_0-rmse:25.04206 [2] validation_0-rmse:20.06856 [3] validation_0-rmse:16.33344 [4] validation_0-rmse:14.05281 [5] validation_0-rmse:12.25334 [6] validation_0-rmse:10.87538 [7] validation_0-rmse:10.06489 [8] validation_0-rmse:9.61827 [9] validation_0-rmse:9.24629 [10] validation_0-rmse:9.08200 [11] validation_0-rmse:8.98298 [12] validation_0-rmse:8.80229 [13] validation_0-rmse:8.74208 [14] validation_0-rmse:8.76290 Train R-Squared: 0.75 Test R-Squared: -0.49 Test RMSE: 8.74
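Early stopping kicked in after round 14: the validation RMSE had stopped improving for 2 consecutive rounds, and the best round (13, RMSE 8.74208) is what the reported test RMSE of 8.74 corresponds to.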
results_df.loc[len(results_df)] = [' XGBoost (TSS)', mean_squared_error(Tss_y_test,xgb_reg.predict(Tss_X_test) , squared=False)]
results_df
| Algorithm | RMSE | |
|---|---|---|
| 0 | LR Baseline | 17.767744 |
| 1 | LR Baseline (TSS) | 10.468930 |
| 2 | LR Baseline (BTSS) | 4.398897 |
| 3 | Ridge LR Standardized (TSS) | 9.294806 |
| 4 | Ridge LR (TSS) | 10.466368 |
| 5 | Ridge LR Standardized (BTSS) | 4.484383 |
| 6 | Ridge LR (BTSS) | 4.398964 |
| 7 | Lasso LR Standardized (TSS) | 9.696076 |
| 8 | Lasso LR (TSS) | 10.550067 |
| 9 | Lasso LR Standardized (BTSS) | 5.259953 |
| 10 | Lasso LR (BTSS) | 4.415596 |
| 11 | Random Forest(TSS) | 8.211579 |
| 12 | Random Forest(BTSS) | 4.295087 |
| 13 | SVR Standardized | 8.591501 |
| 14 | SVR (BTSS) | 38.654286 |
| 15 | Decision Tree (TSS) | 9.080890 |
| 16 | Decision Tree (BTSS) | 4.743625 |
| 17 | Gradient Boosted Tree (TSS) | 10.203069 |
| 18 | Gradient Boosted Trees (BTSS) | 4.089170 |
| 19 | XGBoost (TSS) | 8.742080 |
xgb_df = pd.DataFrame({'Actual': Tss_y_test, 'Predicted': xgb_price_pred})
xgb_df.head(5)
| Actual | Predicted | |
|---|---|---|
| 32112 | 30.82 | 29.293978 |
| 32113 | 28.89 | 28.516697 |
| 32114 | 28.43 | 28.747250 |
| 32115 | 29.41 | 28.425951 |
| 32116 | 34.39 | 28.425951 |
xgb_reg = xgb.XGBRegressor()
xgb_reg.fit(Btss_X_train, Btss_y_train, eval_set = [(Btss_X_test,Btss_y_test)], early_stopping_rounds = 2)
xgb_price_pred = xgb_reg.predict(Btss_X_test)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % xgb_reg.score(Btss_X_train, Btss_y_train))
print('Test R-Squared: %.2f' % r2_score(Btss_y_test,xgb_price_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Btss_y_test, xgb_price_pred, squared=False))
[0] validation_0-rmse:26.74565 [1] validation_0-rmse:18.77427 [2] validation_0-rmse:12.99484 [3] validation_0-rmse:9.16575 [4] validation_0-rmse:6.62573 [5] validation_0-rmse:5.04318 [6] validation_0-rmse:4.22457 [7] validation_0-rmse:3.72308 [8] validation_0-rmse:3.63408 [9] validation_0-rmse:3.67644 Train R-Squared: 0.58 Test R-Squared: 0.37 Test RMSE: 3.63
xgb_df = pd.DataFrame({'Actual': Btss_y_test, 'Predicted': xgb_price_pred})
xgb_df.head(5)
| Actual | Predicted | |
|---|---|---|
| 31690 | 40.29 | 39.727024 |
| 31691 | 41.93 | 39.727024 |
| 31692 | 41.74 | 39.727024 |
| 31693 | 39.99 | 39.727024 |
| 31694 | 38.09 | 39.727024 |
scores = cross_val_score(xgb_reg, Btss_X_train, Btss_y_train, cv=47, scoring='neg_root_mean_squared_error')
xgb_rmse_scores = (-scores)
display_scores(xgb_rmse_scores)
Scores: [-0.46249313 -0.63126038 -1.2125766 -0.23918533 -0.56610657 -0.89927551 -1.31368729 -0.67440857 -0.86233292 -0.80303665 -0.53439301 -2.25792953 -1.00716187 -3.48116852 -3.22890823 -1.74317093 -0.42731972 -1.8951416 -4.53085487 -0.81573868 -0.32423172 -0.61013168 -0.63812775 -0.05981262 -0.41375656 -0.50074554 -0.78817795 -0.06764603 -0.46214081 -0.5284407 -0.79828644 -0.98352295 -0.50990753 -1.04829819 -0.15931747 -0.93828979 -1.09538956 -0.74007843 -1.10343933 -1.54846786 -0.67726486 -5.27899582 -1.17625153 -1.00874939 -0.01632996 -0.00603317 -0.42066788] Mean -1.0529500303877157 Standard Deviation: 1.0743075438043515
results_df.loc[len(results_df)] = [' XGBoost (BTSS)', mean_squared_error(Btss_y_test,xgb_reg.predict(Btss_X_test) , squared=False)]
results_df
| Algorithm | RMSE | |
|---|---|---|
| 0 | LR Baseline | 17.767744 |
| 1 | LR Baseline (TSS) | 10.468930 |
| 2 | LR Baseline (BTSS) | 4.398897 |
| 3 | Ridge LR Standardized (TSS) | 9.294806 |
| 4 | Ridge LR (TSS) | 10.466368 |
| 5 | Ridge LR Standardized (BTSS) | 4.484383 |
| 6 | Ridge LR (BTSS) | 4.398964 |
| 7 | Lasso LR Standardized (TSS) | 9.696076 |
| 8 | Lasso LR (TSS) | 10.550067 |
| 9 | Lasso LR Standardized (BTSS) | 5.259953 |
| 10 | Lasso LR (BTSS) | 4.415596 |
| 11 | Random Forest(TSS) | 8.211579 |
| 12 | Random Forest(BTSS) | 4.295087 |
| 13 | SVR Standardized | 8.591501 |
| 14 | SVR (BTSS) | 38.654286 |
| 15 | Decision Tree (TSS) | 9.080890 |
| 16 | Decision Tree (BTSS) | 4.743625 |
| 17 | Gradient Boosted Tree (TSS) | 10.203069 |
| 18 | Gradient Boosted Trees (BTSS) | 4.089170 |
| 19 | XGBoost (TSS) | 8.742080 |
| 20 | XGBoost (BTSS) | 3.634085 |
# Grid of hyperparameters to search over
param_grid = [{'learning_rate': [0.01,0.1,0.5,0.9],'n_estimators' : [200],'subsample' : [0.3,0.5,0.9]}]
grid_search = GridSearchCV(xgb_reg, param_grid , cv = 10 , scoring = 'neg_root_mean_squared_error', return_train_score = True)
grid_search.fit(Btss_X_train, Btss_y_train)
grid_search.best_params_
{'learning_rate': 0.1, 'n_estimators': 200, 'subsample': 0.3}
grid_search.best_estimator_
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
importance_type='gain', interaction_constraints='',
learning_rate=0.1, max_delta_step=0, max_depth=6,
min_child_weight=1, missing=nan, monotone_constraints='()',
n_estimators=200, n_jobs=8, num_parallel_tree=1, random_state=0,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.3,
tree_method='exact', validate_parameters=1, verbosity=None)
xgb_reg = xgb.XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
importance_type='gain', interaction_constraints='',
learning_rate=0.1, max_delta_step=0, max_depth=6,
min_child_weight=1, monotone_constraints='()',
n_estimators=200, n_jobs=8, num_parallel_tree=1, random_state=0,
reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.3,
tree_method='exact', validate_parameters=1, verbosity=None)
xgb_reg.fit(Btss_X_train, Btss_y_train)
xgb_price_pred = xgb_reg.predict(Btss_X_test)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % xgb_reg.score(Btss_X_train, Btss_y_train))
print('Test R-Squared: %.2f' % r2_score(Btss_y_test,xgb_price_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Btss_y_test, xgb_price_pred, squared=False))
Train R-Squared: 1.00 Test R-Squared: 0.08 Test RMSE: 4.42
# Plotting one tree of the fitted XGBoost model
plot_tree(xgb_reg, num_trees=5) # visualizing a single tree (index 5) of the ensemble
fig = plt.gcf()
fig.set_size_inches(30, 15)
#results_df.loc[len(results_df)] = [' XGBoost (BTSS)', mean_squared_error(Btss_y_test,xgb_reg.predict(Btss_X_test) , squared=False)]
#results_df
# Comparison of the Top 3 Models:
models = []
models.append(('Ridge LR', Ridge(alpha = 1, solver = "cholesky")))
models.append(('Random Forest', RandomForestRegressor(max_features=6, n_estimators=365)))
models.append(('Gradient Boost', GradientBoostingRegressor(max_depth=12, max_features=6, n_estimators=1000)))
# Evaluate each model in turn
results = []
names = []
for name, model in models:
    # 47-fold cross-validation on the BTSS training fold
    cv_results = cross_val_score(model, Btss_X_train, Btss_y_train, cv=47, scoring='neg_root_mean_squared_error')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))
# Compare Algorithms
plt.boxplot(results, labels=names)
plt.title('Algorithm Comparison')
plt.show()
Ridge LR: -1.231071 (0.894640) Random Forest: -0.991281 (0.806767) Gradient Boost: -0.970753 (1.015548)
# Too heavy to compute in full (we did obtain results for Ridge regression and Random Forest; both were worse)
# Comparison of the Top 3 Models using TSS as the cross-validation technique (whole dataset):
#models = []
#models.append(('Ridge LR', Ridge(alpha = 1, solver = "cholesky")))
#models.append(('Random Forest', RandomForestRegressor(max_features=6, n_estimators=365)))
#models.append(('Gradient Boost', GradientBoostingRegressor(max_depth=12, max_features=6, n_estimators=1000)))
# Evaluate each model in turn
#results = []
#names = []
#for name, model in models:
#    # TimeSeries Cross validation
#    tss = TimeSeriesSplit(n_splits=1338)
#    cv_results = cross_val_score(model, X_train, y_train, cv=tss, scoring='neg_root_mean_squared_error')
#    results.append(cv_results)
#    names.append(name)
#    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))
# Compare Algorithms
#plt.boxplot(results, labels=names)
#plt.title('Algorithm Comparison')
#plt.show()
# Comparison of the Top 3 Models using BTSS as the cross-validation technique (whole dataset):
models = []
models.append(('Ridge LR', Ridge(alpha = 1, solver = "cholesky")))
models.append(('Random Forest', RandomForestRegressor(max_features=6, n_estimators=365)))
models.append(('Gradient Boost', GradientBoostingRegressor(max_depth=12, max_features=6, n_estimators=1000)))
# Evaluate each model in turn
results = []
names = []
for name, model in models:
    # TimeSeries Cross validation
    btss = BlockingTimeSeriesSplit(n_splits=446)
    cv_results = cross_val_score(model, X_train, y_train, cv=btss, scoring='neg_root_mean_squared_error')
    results.append(cv_results)
    names.append(name)
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))
# Compare Algorithms
plt.boxplot(results, labels=names)
plt.title('Algorithm Comparison')
plt.show()
Ridge LR: -5.070571 (3.563486) Random Forest: -3.969475 (2.617940) Gradient Boost: -4.044481 (2.592353)
scoring_df = pd.read_csv("scoring.csv")
scoring_df.head(5)
| fc_demand | fc_nuclear | import_FR | export_FR | fc_wind | fc_solar_pv | fc_solar_th | date | hour | |
|---|---|---|---|---|---|---|---|---|---|
| 0 | 24744.0 | 7117.2 | 2300.0 | 1750.0 | 2751.0 | 0.0 | 627.1 | 2020-09-01 | 0 |
| 1 | 23426.0 | 7117.2 | 2300.0 | 2100.0 | 2452.0 | 0.0 | 606.6 | 2020-09-01 | 1 |
| 2 | 22597.0 | 7117.2 | 2300.0 | 2100.0 | 2060.0 | 0.0 | 528.3 | 2020-09-01 | 2 |
| 3 | 22285.0 | 7117.2 | 2300.0 | 2100.0 | 1736.0 | 0.0 | 598.5 | 2020-09-01 | 3 |
| 4 | 22192.0 | 7117.2 | 2300.0 | 2100.0 | 1322.0 | 0.0 | 474.9 | 2020-09-01 | 4 |
scoring_df.describe(include = 'all')
| fc_demand | fc_nuclear | import_FR | export_FR | fc_wind | fc_solar_pv | fc_solar_th | date | hour | |
|---|---|---|---|---|---|---|---|---|---|
| count | 3673.000000 | 3673.000000 | 3673.000000 | 3673.000000 | 3673.000000 | 3673.000000 | 3673.000000 | 3673 | 3673.000000 |
| unique | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 153 | NaN |
| top | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 2020-10-25 | NaN |
| freq | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 25 | NaN |
| mean | 28049.892186 | 6762.296624 | 2454.465015 | 2116.430711 | 7595.305472 | 1406.153526 | 306.545576 | NaN | 11.497414 |
| std | 4876.154745 | 467.608441 | 205.749847 | 268.054958 | 4171.177434 | 2050.604746 | 507.322833 | NaN | 6.923961 |
| min | 8644.000000 | 5691.600000 | 2100.000000 | 1200.000000 | 488.000000 | 0.000000 | 0.000000 | NaN | 0.000000 |
| 25% | 23961.000000 | 6125.500000 | 2400.000000 | 2100.000000 | 4214.000000 | 0.000000 | 10.300000 | NaN | 5.000000 |
| 50% | 28347.000000 | 7117.200000 | 2400.000000 | 2200.000000 | 6993.000000 | 0.000000 | 35.800000 | NaN | 11.000000 |
| 75% | 31481.000000 | 7117.200000 | 2500.000000 | 2200.000000 | 10435.000000 | 2742.300000 | 375.000000 | NaN | 17.000000 |
| max | 41615.000000 | 7117.200000 | 3000.000000 | 2800.000000 | 19287.000000 | 6945.200000 | 2150.300000 | NaN | 23.000000 |
#Combining solar energies:
scoring_df['fc_solar'] = scoring_df['fc_solar_th'] + scoring_df['fc_solar_pv']
#Creating the aforementioned Thermal Gap:
scoring_df['Thermal_Gap'] = scoring_df['fc_demand'] - scoring_df['fc_nuclear'] - scoring_df['fc_wind'] - scoring_df['fc_solar']
#Creating Energy Trade Balance SPAIN/FR
scoring_df['BoT_FR'] = scoring_df['export_FR'] - scoring_df['import_FR']
scoring_df = scoring_df.drop(columns = ['import_FR','export_FR','fc_solar_pv','fc_solar_th'], axis = 1)
#Converting dataframe series "date" into datetime to use dt. functionalities:
scoring_df['date'] = pd.to_datetime(scoring_df['date'])
# Adding additional columns with year & day of the year:
scoring_df['Year'] = scoring_df['date'].dt.year
scoring_df['Day_of_Year'] = scoring_df['date'].dt.dayofyear
#Adding column for feature weekend
scoring_df['weekend'] = scoring_df['date'].dt.dayofweek
scoring_df['weekend'] = scoring_df['weekend'].apply(weekend) # map day-of-week to the binary weekend flag
festivos = ['2020-10-12','2020-12-08','2020-12-25','2021-01-01','2021-01-06'] # Spanish public holidays ("festivos") within the scoring horizon
festivosdate = pd.to_datetime(festivos)
scoring_df['holiday'] = scoring_df['date'].isin(festivosdate)
scoring_df['holiday'] = scoring_df['holiday'].apply(holiday)
scoring_df['covid_period'] = 1 # the entire scoring horizon falls within the covid period
scoring_df['hard_lockdown'] = 0 # no hard lockdowns in the scoring horizon
# To compute the scoring set's first-day lags, prepend the last 24 hours of the training data before differencing
Thermal_Gap_df = pd.concat([pm_df.iloc[-24:]['Thermal_Gap'],scoring_df['Thermal_Gap']])
hourly_lag = Thermal_Gap_df.diff()
daily_lag = Thermal_Gap_df.diff(periods = 24)
scoring_df['thermal_gap_lag_hourly']=hourly_lag.iloc[24:]
scoring_df['thermal_gap_daily']=daily_lag.iloc[24:]
scoring_df = scoring_df.drop(columns = ['date'], axis = 1)
scoring_df.head(5)
| fc_demand | fc_nuclear | fc_wind | hour | fc_solar | Thermal_Gap | BoT_FR | Year | Day_of_Year | weekend | holiday | covid_period | hard_lockdown | thermal_gap_lag_hourly | thermal_gap_daily | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 24744.0 | 7117.2 | 2751.0 | 0 | 627.1 | 14248.7 | -550.0 | 2020 | 245 | 0 | 0 | 1 | 0 | -1942.3 | 6115.6 |
| 1 | 23426.0 | 7117.2 | 2452.0 | 1 | 606.6 | 13250.2 | -200.0 | 2020 | 245 | 0 | 0 | 1 | 0 | -998.5 | 6335.1 |
| 2 | 22597.0 | 7117.2 | 2060.0 | 2 | 528.3 | 12891.5 | -200.0 | 2020 | 245 | 0 | 0 | 1 | 0 | -358.7 | 6352.5 |
| 3 | 22285.0 | 7117.2 | 1736.0 | 3 | 598.5 | 12833.3 | -200.0 | 2020 | 245 | 0 | 0 | 1 | 0 | -58.2 | 6422.9 |
| 4 | 22192.0 | 7117.2 | 1322.0 | 4 | 474.9 | 13277.9 | -200.0 | 2020 | 245 | 0 | 0 | 1 | 0 | 444.6 | 6087.7 |
from pandas_profiling import ProfileReport
scoring_report = ProfileReport(scoring_df, minimal=False)
scoring_report
# In case Predictions need to be re-scaled:
scaler = StandardScaler().fit(scoring_df)
standard_scoring_df = scaler.transform(scoring_df)
standard_scoring_df = pd.DataFrame(data = standard_scoring_df, index = None , columns = scoring_df.columns)
standard_scoring_df.head(12)
| fc_demand | fc_nuclear | fc_wind | hour | fc_solar | Thermal_Gap | BoT_FR | Year | Day_of_Year | weekend | holiday | covid_period | hard_lockdown | thermal_gap_lag_hourly | thermal_gap_daily | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | -0.678063 | 0.759079 | -1.161534 | -1.660752 | -0.439704 | 0.379601 | -0.687667 | -0.503996 | -0.015402 | -0.635651 | -0.183778 | 0.0 | 0.0 | -1.144410 | 1.295233 |
| 1 | -0.948395 | 0.759079 | -1.233226 | -1.516306 | -0.448008 | 0.212561 | 0.447816 | -0.503996 | -0.015402 | -0.635651 | -0.183778 | 0.0 | 0.0 | -0.587317 | 1.341298 |
| 2 | -1.118429 | 0.759079 | -1.327217 | -1.371860 | -0.479722 | 0.152554 | 0.447816 | -0.503996 | -0.015402 | -0.635651 | -0.183778 | 0.0 | 0.0 | -0.209664 | 1.344950 |
| 3 | -1.182423 | 0.759079 | -1.404904 | -1.227415 | -0.451288 | 0.142818 | 0.447816 | -0.503996 | -0.015402 | -0.635651 | -0.183778 | 0.0 | 0.0 | -0.032289 | 1.359724 |
| 4 | -1.201498 | 0.759079 | -1.504170 | -1.082969 | -0.501351 | 0.217195 | 0.447816 | -0.503996 | -0.015402 | -0.635651 | -0.183778 | 0.0 | 0.0 | 0.264497 | 1.289378 |
| 5 | -1.135658 | 0.759079 | -1.549967 | -0.938523 | -0.597101 | 0.342395 | 0.447816 | -0.503996 | -0.015402 | -0.635651 | -0.183778 | 0.0 | 0.0 | 0.443820 | 1.270617 |
| 6 | -0.733237 | 0.759079 | -1.541095 | -0.794078 | -0.651497 | 0.686897 | 0.447816 | -0.503996 | -0.015402 | -0.635651 | -0.183778 | 0.0 | 0.0 | 1.217601 | 1.220396 |
| 7 | -0.255952 | 0.759079 | -1.540615 | -0.649632 | -0.675677 | 1.085836 | -0.687667 | -0.503996 | -0.015402 | -0.635651 | -0.183778 | 0.0 | 0.0 | 1.409674 | 1.248917 |
| 8 | 0.093141 | 0.759079 | -1.563873 | -0.505186 | -0.451491 | 1.294196 | -0.687667 | -0.503996 | -0.015402 | -0.635651 | -0.183778 | 0.0 | 0.0 | 0.737242 | 1.186378 |
| 9 | 0.360601 | 0.759079 | -1.601278 | -0.360741 | 0.105552 | 1.308366 | -0.687667 | -0.503996 | -0.015402 | -0.635651 | -0.183778 | 0.0 | 0.0 | 0.052060 | 1.267657 |
| 10 | 0.579656 | 0.759079 | -1.625255 | -0.216295 | 0.801360 | 1.216372 | -0.687667 | -0.503996 | -0.015402 | -0.635651 | -0.183778 | 0.0 | 0.0 | -0.322523 | 1.531014 |
| 11 | 0.673595 | 0.759079 | -1.624536 | -0.071849 | 1.298336 | 1.087224 | -0.687667 | -0.503996 | -0.015402 | -0.635651 | -0.183778 | 0.0 | 0.0 | -0.453621 | 1.232038 |
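One caveat: this scaler is fitted on the scoring set itself, so its means and standard deviations differ from those of the training data. If standardized predictions were actually needed, the scaler fitted on the training folds should be reused, calling only transform on the scoring set.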
#Matching the variables used for training, in the same order
scoring_df = scoring_df.loc[:,['Thermal_Gap', 'fc_wind' , 'fc_solar', 'BoT_FR' , 'holiday' , 'hour', 'weekend' , 'thermal_gap_daily', 'covid_period', 'hard_lockdown']]
scoring_df.head()
| Thermal_Gap | fc_wind | fc_solar | BoT_FR | holiday | hour | weekend | thermal_gap_daily | covid_period | hard_lockdown | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 14248.7 | 2751.0 | 627.1 | -550.0 | 0 | 0 | 0 | 6115.6 | 1 | 0 |
| 1 | 13250.2 | 2452.0 | 606.6 | -200.0 | 0 | 1 | 0 | 6335.1 | 1 | 0 |
| 2 | 12891.5 | 2060.0 | 528.3 | -200.0 | 0 | 2 | 0 | 6352.5 | 1 | 0 |
| 3 | 12833.3 | 1736.0 | 598.5 | -200.0 | 0 | 3 | 0 | 6422.9 | 1 | 0 |
| 4 | 13277.9 | 1322.0 | 474.9 | -200.0 | 0 | 4 | 0 | 6087.7 | 1 | 0 |
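Since the fitted models consume features by position, it is worth asserting that the scoring columns line up exactly with the training design matrix; a small sanity check (a sketch):
# columns must match the training features in both name and order
assert list(scoring_df.columns) == list(Btss_X_train.columns), "scoring/training column mismatch"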
price_pred = forest_reg.predict(scoring_df) # forest_reg is the tuned Random Forest (max_features=6, n_estimators=365) fitted above
len(price_pred)
3673
#index = scoring_df['date']
output = pd.DataFrame({'Price': price_pred})
output.to_csv('ML_group_assignment_predictions_group_H.csv', index = False)
output.head()
| Price | |
|---|---|
| 0 | 39.741014 |
| 1 | 38.580630 |
| 2 | 38.277068 |
| 3 | 37.804110 |
| 4 | 38.147562 |
#De-standardizing the predicted output(if needed)
#mean = y.iloc[test_index].mean()
#std = y.iloc[test_index].std()
#svr_df_destandardized = output.apply(lambda row:((row*std)+mean),axis =1)
#svr_df_destandardized.head(5)
# Re-running the BTSS Random Forest with price_lag_daily in place of thermal_gap_daily
X = pm_df.loc[:,['Thermal_Gap', 'fc_wind' , 'fc_solar', 'BoT_FR' , 'holiday' , 'hour', 'weekend' , 'price_lag_daily', 'covid_period', 'hard_lockdown']]
y = pm_df['price']
#Using BTSS
btss = BlockingTimeSeriesSplit(n_splits = 446)
for train_index, test_index in btss.split(X):
    Btss_X_train, Btss_X_test = X.iloc[train_index, :], X.iloc[test_index, :]
    Btss_y_train, Btss_y_test = y.iloc[train_index], y.iloc[test_index]
# note: the splits are overwritten on each iteration, so the fit below uses the final fold only
forest_reg = RandomForestRegressor()
forest_reg.fit(Btss_X_train, Btss_y_train)
forest_price_pred = forest_reg.predict(Btss_X_test)
# The coefficient of determination: R-squared
print('Train R-Squared: %.2f' % forest_reg.score(Btss_X_train, Btss_y_train))
print('Test R-Squared: %.2f' % r2_score(Btss_y_test,forest_price_pred))
# RMSE:
print('Test RMSE: %.2f' % mean_squared_error(Btss_y_test, forest_price_pred, squared=False))
Train R-Squared: 0.97 Test R-Squared: 0.17 Test RMSE: 4.19
covid_df = pm_df[pm_df['covid_period'] == 1]
len(covid_df)
4103
Covid_X = covid_df.loc[:,['Thermal_Gap', 'fc_wind' , 'fc_solar', 'BoT_FR' , 'holiday' , 'hour', 'weekend' , 'thermal_gap_daily', 'hard_lockdown']]
Covid_y = covid_df['price']
Covid_X_train, Covid_X_test, Covid_y_train, Covid_y_test = train_test_split(Covid_X, Covid_y, test_size=0.2, random_state=42 , shuffle=False) # shuffle=False preserves the chronological order of the data
Covid_X_test
| Thermal_Gap | fc_wind | fc_solar | BoT_FR | holiday | hour | weekend | thermal_gap_daily | hard_lockdown | |
|---|---|---|---|---|---|---|---|---|---|
| 31314 | 15450.6 | 7779.0 | 4695.2 | 50.0 | 0 | 19.0 | 0 | -2995.5 | 0 |
| 31315 | 15814.3 | 8681.0 | 2429.5 | 50.0 | 0 | 20.0 | 0 | -4293.5 | 0 |
| 31316 | 16769.3 | 8993.0 | 930.5 | 50.0 | 0 | 21.0 | 0 | -4038.7 | 0 |
| 31317 | 16010.7 | 9190.0 | 604.1 | 50.0 | 0 | 22.0 | 0 | -3539.3 | 0 |
| 31318 | 14105.6 | 8910.0 | 576.2 | 50.0 | 0 | 23.0 | 0 | -2818.3 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 32130 | 15218.4 | 2846.0 | 3650.4 | -450.0 | 0 | 19.0 | 0 | 9953.1 | 0 |
| 32131 | 18131.7 | 2861.0 | 1343.1 | -450.0 | 0 | 20.0 | 0 | 10280.9 | 0 |
| 32132 | 20026.3 | 2859.0 | 682.5 | -450.0 | 0 | 21.0 | 0 | 9197.1 | 0 |
| 32133 | 18212.5 | 2771.0 | 598.3 | -450.0 | 0 | 22.0 | 0 | 7631.9 | 0 |
| 32134 | 16191.0 | 2746.0 | 617.8 | -450.0 | 0 | 23.0 | 0 | 6751.7 | 0 |
821 rows × 9 columns
boosted_tree_reg = GradientBoostingRegressor()
boosted_tree_reg.fit(Covid_X_train, Covid_y_train)
boosted_tree_price_pred = boosted_tree_reg.predict(Covid_X_test)
boosted_tree_mse = mean_squared_error(Covid_y_test, boosted_tree_price_pred)
boosted_tree_rmse = np.sqrt(boosted_tree_mse)
# The root mean squared error
print('Root Mean Squared Error: %.2f'
% boosted_tree_rmse)
Root Mean Squared Error: 4.63
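To see where the boosted tree struggles on the COVID hold-out window, a quick illustrative sketch (not part of the original output) plotting actual against predicted prices, using the matplotlib import from the top of the notebook:
#Sketch (illustrative): actual vs predicted prices on the COVID hold-out set
plt.figure(figsize=(12, 4))
plt.plot(Covid_y_test.values, label='Actual price')
plt.plot(boosted_tree_price_pred, label='Predicted price')
plt.xlabel('Hour within the test window')
plt.ylabel('Price')
plt.legend()
plt.show()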
required_splits = (len(covid_df.index)/24) #aiming for one test block per day (24 hourly rows)
print(required_splits)
170.95833333333334
ctss = TimeSeriesSplit(n_splits = 171)
print(ctss)
TimeSeriesSplit(max_train_size=None, n_splits=171)
for train_index, test_index in ctss.split(Covid_X):
    ctss_X_train, ctss_X_test = Covid_X.iloc[train_index, :], Covid_X.iloc[test_index,:]
    ctss_y_train, ctss_y_test = Covid_y.iloc[train_index], Covid_y.iloc[test_index]
boosted_tree_reg.fit(ctss_X_train, ctss_y_train)
boosted_tree_price_pred = boosted_tree_reg.predict(ctss_X_test)
boosted_tree_mse = mean_squared_error(ctss_y_test, boosted_tree_price_pred)
boosted_tree_rmse = np.sqrt(boosted_tree_mse)
# The root mean squared error
print('Root Mean Squared Error: %.2f'
% boosted_tree_rmse)
Root Mean Squared Error: 7.81
required_splits = (len(covid_df.index)/(24*3)) #one block per three days (72 hourly rows)
print(required_splits)
56.986111111111114
bctss = BlockingTimeSeriesSplit(n_splits = 57 )
print(bctss)
<__main__.BlockingTimeSeriesSplit object at 0x7fd6fcd13d00>
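The difference between the two splitters matters here: TimeSeriesSplit grows the training window fold by fold, while the custom BlockingTimeSeriesSplit keeps each train/test pair inside a fixed, non-overlapping block. A small sketch printing the first few fold boundaries of each, assuming both yield positional index arrays as the loops in this notebook rely on:
#Sketch (illustrative): compare the first three fold boundaries of each splitter
for i, (tr, te) in enumerate(ctss.split(Covid_X)):
    if i == 3:
        break
    print('TimeSeriesSplit fold %d: train %d-%d, test %d-%d' % (i, tr[0], tr[-1], te[0], te[-1]))
for i, (tr, te) in enumerate(bctss.split(Covid_X)):
    if i == 3:
        break
    print('BlockingTimeSeriesSplit fold %d: train %d-%d, test %d-%d' % (i, tr[0], tr[-1], te[0], te[-1]))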
for train_index, test_index in bctss.split(Covid_X):
    bctss_X_train, bctss_X_test = Covid_X.iloc[train_index, :], Covid_X.iloc[test_index,:]
    bctss_y_train, bctss_y_test = Covid_y.iloc[train_index], Covid_y.iloc[test_index]
boosted_tree_reg.fit(bctss_X_train, bctss_y_train)
boosted_tree_price_pred = boosted_tree_reg.predict(bctss_X_test)
boosted_tree_mse = mean_squared_error(bctss_y_test, boosted_tree_price_pred)
boosted_tree_rmse = np.sqrt(boosted_tree_mse)
# The root mean squared error
print('Root Mean Squared Error: %.2f'
% boosted_tree_rmse)
Root Mean Squared Error: 6.65
year_train_df = pm_df[23327:] ### Keeping only the most recent year of data (8,760 hourly rows)
year_train_df.head()
| fc_demand | fc_nuclear | fc_wind | price | hour | fc_solar | Thermal_Gap | BoT_FR | Year | Day_of_Year | weekend | holiday | covid_period | hard_lockdown | price_lag_hourly | price_lag_daily | thermal_gap_lag_hourly | thermal_gap_daily | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 23375 | 25462.0 | 7117.2 | 9759.0 | 33.00 | 0.0 | 462.8 | 8123.0 | -750.0 | 2019 | 245 | 0 | 0 | 0 | 0 | -4.50 | -13.53 | -1233.5 | -6359.0 |
| 23376 | 24014.0 | 7117.2 | 9371.0 | 30.50 | 1.0 | 400.9 | 7124.9 | 0.0 | 2019 | 245 | 0 | 0 | 0 | 0 | -2.50 | -11.51 | -998.1 | -5992.5 |
| 23377 | 23048.0 | 7117.2 | 9054.0 | 25.96 | 2.0 | 228.3 | 6648.5 | 0.0 | 2019 | 245 | 0 | 0 | 0 | 0 | -4.54 | -14.04 | -476.4 | -6027.8 |
| 23378 | 22641.0 | 7117.2 | 9177.0 | 25.00 | 3.0 | 174.1 | 6172.7 | 0.0 | 2019 | 245 | 0 | 0 | 0 | 0 | -0.96 | -16.42 | -475.8 | -5944.7 |
| 23379 | 22487.0 | 7117.2 | 9360.0 | 25.63 | 4.0 | 39.1 | 5970.7 | 0.0 | 2019 | 245 | 0 | 0 | 0 | 0 | 0.63 | -15.45 | -202.0 | -5581.2 |
Year_X = year_train_df.loc[:,['Thermal_Gap', 'fc_wind' , 'fc_solar', 'BoT_FR' , 'holiday' , 'hour', 'weekend' , 'thermal_gap_daily', 'covid_period', 'hard_lockdown']]
Year_y = year_train_df['price']
Year_X_train, Year_X_test, Year_y_train, Year_y_test = train_test_split(Year_X, Year_y, test_size=0.2, random_state=42, shuffle=False) #shuffle=False again preserves chronological order
len(year_train_df)
8760
Year_X_test
| Thermal_Gap | fc_wind | fc_solar | BoT_FR | holiday | hour | weekend | thermal_gap_daily | covid_period | hard_lockdown | |
|---|---|---|---|---|---|---|---|---|---|---|
| 30383 | 14502.9 | 3833.0 | 609.6 | -750.0 | 0 | 0.0 | 1 | -416.8 | 1 | 1 |
| 30384 | 13501.5 | 3540.0 | 596.0 | -750.0 | 0 | 1.0 | 1 | -73.1 | 1 | 1 |
| 30385 | 13116.5 | 3071.0 | 567.0 | -250.0 | 0 | 2.0 | 1 | 157.6 | 1 | 1 |
| 30386 | 12975.2 | 2615.0 | 552.3 | -250.0 | 0 | 3.0 | 1 | 79.2 | 1 | 1 |
| 30387 | 13132.6 | 2132.0 | 548.9 | -250.0 | 0 | 4.0 | 1 | 111.4 | 1 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 32130 | 15218.4 | 2846.0 | 3650.4 | -450.0 | 0 | 19.0 | 0 | 9953.1 | 1 | 0 |
| 32131 | 18131.7 | 2861.0 | 1343.1 | -450.0 | 0 | 20.0 | 0 | 10280.9 | 1 | 0 |
| 32132 | 20026.3 | 2859.0 | 682.5 | -450.0 | 0 | 21.0 | 0 | 9197.1 | 1 | 0 |
| 32133 | 18212.5 | 2771.0 | 598.3 | -450.0 | 0 | 22.0 | 0 | 7631.9 | 1 | 0 |
| 32134 | 16191.0 | 2746.0 | 617.8 | -450.0 | 0 | 23.0 | 0 | 6751.7 | 1 | 0 |
1752 rows × 10 columns
required_splits = (len(year_train_df.index)/(24*3)) #again one block per three days (72 hourly rows)
print(required_splits)
121.66666666666667
bytss = BlockingTimeSeriesSplit(n_splits = 122)
print(bytss)
<__main__.BlockingTimeSeriesSplit object at 0x7fd6fc49e340>
for train_index, test_index in bytss.split(year_train_df):
    bytss_X_train, bytss_X_test = Year_X.iloc[train_index, :], Year_X.iloc[test_index,:]
    bytss_y_train, bytss_y_test = Year_y.iloc[train_index], Year_y.iloc[test_index]
bytss_y_test.head()
32013 48.38 32014 44.86 32015 46.91 32016 42.91 32017 40.02 Name: price, dtype: float64
boosted_tree_reg.fit(bytss_X_train, bytss_y_train)
boosted_tree_price_pred = boosted_tree_reg.predict(bytss_X_test)
boosted_tree_mse = mean_squared_error(bytss_y_test, boosted_tree_price_pred)
boosted_tree_rmse = np.sqrt(boosted_tree_mse)
print('Train R-Squared: %.2f' % boosted_tree_reg.score(bytss_X_train,bytss_y_train))
print('Test R-Squared: %.2f' % r2_score(bytss_y_test,boosted_tree_price_pred))
# The root mean squared error
print('Root Mean Squared Error: %.2f'
% boosted_tree_rmse)
Train R-Squared: 1.00 Test R-Squared: -4.25 Root Mean Squared Error: 8.74
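The gap between a near-perfect train fit and a negative test R-squared points to overfitting on this block. A short illustrative sketch inspecting the impurity-based feature importances of the last fitted boosted tree, which can hint at which drivers the model leans on:
#Sketch (illustrative): rank features of the last fitted model by impurity-based importance
importances = pd.Series(boosted_tree_reg.feature_importances_, index=Year_X.columns)
print(importances.sort_values(ascending=False))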
# BoT_FR is the target variable: we predict it in order to replace the values previously imputed with 'bfill'
X_imp = pm_df.loc[:, pm_df.columns != 'BoT_FR']
y_imp = pm_df['BoT_FR']
btss_imp = BlockingTimeSeriesSplit(n_splits = 77)
print(btss_imp)
<__main__.BlockingTimeSeriesSplit object at 0x7fd6fc49e730>
for train_index, test_index in btss_imp.split(X_imp):
    Btss_X_imp_train, Btss_X_imp_test = X_imp.iloc[train_index, :], X_imp.iloc[test_index,:]
    Btss_y_imp_train, Btss_y_imp_test = y_imp.iloc[train_index], y_imp.iloc[test_index]
Btss_X_imp_test.head(5)
| fc_demand | fc_nuclear | fc_wind | price | hour | fc_solar | Thermal_Gap | Year | Day_of_Year | weekend | holiday | covid_period | hard_lockdown | price_lag_hourly | price_lag_daily | thermal_gap_lag_hourly | thermal_gap_daily | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 31941 | 27840.0 | 7117.2 | 6714.0 | 38.07 | 22.0 | 620.5 | 13388.3 | 2020 | 236 | 1 | 0 | 1 | 0 | -0.80 | -1.72 | -580.3 | -2970.6 |
| 31942 | 26342.0 | 7117.2 | 6953.0 | 31.99 | 23.0 | 618.6 | 11653.2 | 2020 | 236 | 1 | 0 | 1 | 0 | -6.08 | -5.88 | -1735.1 | -2634.9 |
| 31943 | 24447.0 | 7117.2 | 6849.0 | 32.26 | 0.0 | 626.5 | 9854.3 | 2020 | 237 | 0 | 0 | 1 | 0 | 0.27 | -5.81 | -1798.9 | -2409.9 |
| 31944 | 22954.0 | 7117.2 | 6712.0 | 27.95 | 1.0 | 615.7 | 8509.1 | 2020 | 237 | 0 | 0 | 1 | 0 | -4.31 | -7.58 | -1345.2 | -2413.4 |
| 31945 | 21964.0 | 7117.2 | 6660.0 | 27.14 | 2.0 | 611.9 | 7574.9 | 2020 | 237 | 0 | 0 | 1 | 0 | -0.81 | -4.74 | -934.2 | -2319.2 |
imp_lr = LinearRegression()
# Train the model using the training sets
imp_lr.fit(Btss_X_imp_train, Btss_y_imp_train)
# Make predictions using the testing set
Btss_BoT_pred = imp_lr.predict(Btss_X_imp_test)
# The coefficients
print('Coefficients: \n', imp_lr.coef_)
#Evaluation Metric
rmse = mean_squared_error(Btss_y_imp_test, Btss_BoT_pred, squared=False)
print('RMSE: %.2f'
% rmse)
# The coefficient of determination:
print('Coefficient of determination: %.2f'
% r2_score(Btss_y_imp_test, Btss_BoT_pred))
Coefficients: [-4.24877770e-02 -1.69969493e-01 9.12309232e-02 8.55525977e+00 -6.92270606e+00 3.20034472e-02 4.24734540e-03 -1.74082970e-13 -2.66895629e+01 -2.41097437e+02 3.84586096e+02 5.68434189e-14 7.10542736e-15 -8.36721440e+00 -1.50416682e+01 2.95086414e-02 5.05099437e-02] RMSE: 273.05 Coefficient of determination: -57.42
#Test predictions performed poorly from the start. We dropped any further attempt at improving this imputation once the new scoring dataset was provided.
values_to_be_imputed_df = pd.DataFrame({'Actual': Btss_y_imp_test, 'Predicted': Btss_BoT_pred})
values_to_be_imputed_df.head()
| Actual | Predicted | |
|---|---|---|
| 31941 | 0.0 | -573.456840 |
| 31942 | 0.0 | -464.743180 |
| 31943 | -300.0 | -69.867189 |
| 31944 | -300.0 | 9.370264 |
| 31945 | -300.0 | -26.370294 |
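For context on how weak an RMSE of 273.05 is for BoT_FR, whose hourly values sit in the low hundreds, a minimal sketch of a naive persistence baseline (the previous hour's value) evaluated on the same last-fold test block; it reuses the test_index left over from the loop above and assumes BoT_FR has no missing values inside that block:
#Sketch (illustrative): persistence baseline for BoT_FR on the last fold's test block
naive_pred = y_imp.shift(1).iloc[test_index] #previous hour's BoT_FR, positional indexing as in the CV loop
naive_rmse = mean_squared_error(Btss_y_imp_test, naive_pred, squared=False)
print('Persistence baseline RMSE: %.2f' % naive_rmse)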